From 81da854903daef56723820a8f68ed5e95db47b60 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:32:33 +0800
Subject: [PATCH 1/9] remove legacy C++ code

---
 paddle/legacy/api/Arguments.cpp               |  174 -
 paddle/legacy/api/CMakeLists.txt              |  120 -
 paddle/legacy/api/ConfigParser.cpp            |  114 -
 paddle/legacy/api/Evaluator.cpp               |   44 -
 paddle/legacy/api/GradientMachine.cpp         |  196 -
 paddle/legacy/api/Internal.h                  |   28 -
 paddle/legacy/api/Matrix.cpp                  |  317 --
 paddle/legacy/api/Paddle.i                    |  202 -
 paddle/legacy/api/PaddleAPI.h                 | 1054 ----
 paddle/legacy/api/PaddleAPIPrivate.h          |   97 -
 paddle/legacy/api/Parameter.cpp               |   68 -
 paddle/legacy/api/ParameterOptimizer.cpp      |  124 -
 paddle/legacy/api/ParameterUpdater.cpp        |   99 -
 paddle/legacy/api/SequenceGenerator.cpp       |  242 -
 paddle/legacy/api/Trainer.cpp                 |  175 -
 paddle/legacy/api/Util.cpp                    |   60 -
 paddle/legacy/api/Vector.cpp                  |  304 --
 paddle/legacy/api/__init__.py                 |   13 -
 paddle/legacy/api/numpy.i                     | 3161 -----------
 paddle/legacy/api/test/.gitignore             |    2 -
 paddle/legacy/api/test/CMakeLists.txt         |   11 -
 paddle/legacy/api/test/testArguments.py       |   54 -
 paddle/legacy/api/test/testGradientMachine.py |  116 -
 paddle/legacy/api/test/testMatrix.py          |  120 -
 paddle/legacy/api/test/testTrain.py           |  116 -
 paddle/legacy/api/test/testTrainConfig.py     |   25 -
 paddle/legacy/api/test/testTrainer.py         |   63 -
 paddle/legacy/api/test/testVector.py          |  153 -
 paddle/legacy/api/test/util.py                |   59 -
 paddle/legacy/capi/Arguments.cpp              |  140 -
 paddle/legacy/capi/CMakeLists.txt             |  118 -
 paddle/legacy/capi/Main.cpp                   |   53 -
 paddle/legacy/capi/Matrix.cpp                 |  171 -
 paddle/legacy/capi/Vector.cpp                 |   69 -
 paddle/legacy/capi/arguments.h                |  171 -
 paddle/legacy/capi/capi.h                     |   32 -
 paddle/legacy/capi/capi_private.h             |   82 -
 paddle/legacy/capi/config.h.in                |   13 -
 paddle/legacy/capi/error.cpp                  |   32 -
 paddle/legacy/capi/error.h                    |   45 -
 paddle/legacy/capi/examples/.gitignore        |    2 -
 paddle/legacy/capi/examples/README.md         |    3 -
 .../capi/examples/model_inference/README.md   |   42 -
 .../examples/model_inference/common/common.h  |   42 -
 .../model_inference/dense/CMakeLists.txt      |    6 -
 .../model_inference/dense/convert_protobin.sh |    2 -
 .../examples/model_inference/dense/main.c     |  116 -
 .../model_inference/dense/merge_v2_model.py   |   22 -
 .../model_inference/dense/mnist_v2.py         |  131 -
 .../model_inference/dense/trainer_config.py   |   13 -
 .../model_inference/multi_thread/.gitignore   |   73 -
 .../multi_thread/CMakeLists.txt               |   29 -
 .../multi_thread/convert_protobin.sh          |    1 -
 .../model_inference/multi_thread/main.c       |  112 -
 .../model_inference/multi_thread/main_gpu.c   |  127 -
 .../multi_thread/trainer_config.py            |   13 -
 .../model_inference/sequence/.gitignore       |   73 -
 .../model_inference/sequence/CMakeLists.txt   |    6 -
 .../sequence/convert_protobin.sh              |    1 -
 .../examples/model_inference/sequence/main.c  |   84 -
 .../sequence/trainer_config.py                |   27 -
 .../model_inference/sparse_binary/.gitignore  |   73 -
 .../sparse_binary/CMakeLists.txt              |    7 -
 .../sparse_binary/convert_protobin.sh         |    1 -
 .../model_inference/sparse_binary/main.c      |   87 -
 .../sparse_binary/trainer_config.py           |   13 -
 paddle/legacy/capi/gradient_machine.cpp       |  180 -
 paddle/legacy/capi/gradient_machine.h         |  127 -
 paddle/legacy/capi/main.h                     |   40 -
 paddle/legacy/capi/matrix.h                   |  146 -
 paddle/legacy/capi/paddle_capi.map            |    6 -
 paddle/legacy/capi/tests/.gitignore           |    2 -
 paddle/legacy/capi/tests/CMakeLists.txt       |   15 -
 paddle/legacy/capi/tests/test_Arguments.cpp   |  129 -
 .../capi/tests/test_GradientMachine.cpp       |  117 -
 paddle/legacy/capi/tests/test_Matrix.cpp      |   93 -
 paddle/legacy/capi/tests/test_Vector.cpp      |   32 -
 .../legacy/capi/tests/test_predict_network.py |   27 -
 paddle/legacy/capi/vector.h                   |   89 -
 paddle/legacy/cuda/CMakeLists.txt             |   89 -
 .../cuda/include/hl_activation_functions.h    |   60 -
 paddle/legacy/cuda/include/hl_aggregate.h     |  106 -
 paddle/legacy/cuda/include/hl_avx_functions.h |   32 -
 paddle/legacy/cuda/include/hl_base.h          |  250 -
 paddle/legacy/cuda/include/hl_batch_norm.h    |   48 -
 .../legacy/cuda/include/hl_batch_transpose.h  |   36 -
 paddle/legacy/cuda/include/hl_cnn.h           |  417 --
 paddle/legacy/cuda/include/hl_cpu_gru.cuh     |  477 --
 paddle/legacy/cuda/include/hl_cpu_lstm.cuh    |  372 --
 .../cuda/include/hl_cpu_matrix_kernel.cuh     |  196 -
 .../include/hl_cpu_matrix_kernel_detail.cuh   |  310 --
 paddle/legacy/cuda/include/hl_cpu_scalar.cuh  |   50 -
 .../legacy/cuda/include/hl_cpu_simd_neon.cuh  |   73 -
 .../legacy/cuda/include/hl_cpu_simd_sse.cuh   |   94 -
 paddle/legacy/cuda/include/hl_cuda.h          |  345 --
 paddle/legacy/cuda/include/hl_cuda.ph         |  112 -
 paddle/legacy/cuda/include/hl_cuda_cublas.h   |  172 -
 paddle/legacy/cuda/include/hl_cuda_cudnn.h    |  516 --
 paddle/legacy/cuda/include/hl_cuda_cudnn.ph   |   80 -
 .../cuda/include/hl_device_functions.cuh      |   71 -
 paddle/legacy/cuda/include/hl_functions.h     |   57 -
 paddle/legacy/cuda/include/hl_gpu.h           |   44 -
 .../legacy/cuda/include/hl_gpu_functions.cuh  |   68 -
 paddle/legacy/cuda/include/hl_gpu_gru.cuh     |  393 --
 paddle/legacy/cuda/include/hl_gpu_lstm.cuh    |  300 --
 .../cuda/include/hl_gpu_matrix_kernel.cuh     |  629 ---
 paddle/legacy/cuda/include/hl_gru_ops.cuh     |  205 -
 paddle/legacy/cuda/include/hl_lstm.h          |  130 -
 paddle/legacy/cuda/include/hl_lstm_ops.cuh    |  213 -
 paddle/legacy/cuda/include/hl_matrix.h        |  311 --
 .../legacy/cuda/include/hl_matrix_apply.cuh   |  423 --
 paddle/legacy/cuda/include/hl_matrix_base.cuh |  164 -
 .../cuda/include/hl_matrix_base_detail.cuh    |  153 -
 paddle/legacy/cuda/include/hl_matrix_ops.cuh  |  253 -
 paddle/legacy/cuda/include/hl_matrix_type.cuh |   51 -
 .../cuda/include/hl_perturbation_util.cuh     |   51 -
 .../cuda/include/hl_recurrent_apply.cuh       |  192 -
 paddle/legacy/cuda/include/hl_sequence.h      |  168 -
 paddle/legacy/cuda/include/hl_sparse.h        |  523 --
 paddle/legacy/cuda/include/hl_sparse.ph       |   85 -
 paddle/legacy/cuda/include/hl_table_apply.h   |   81 -
 paddle/legacy/cuda/include/hl_tensor_ops.h    |  536 --
 paddle/legacy/cuda/include/hl_thread.ph       |   84 -
 paddle/legacy/cuda/include/hl_time.h          |   29 -
 paddle/legacy/cuda/include/hl_top_k.h         |   87 -
 paddle/legacy/cuda/include/hl_warpctc_wrap.h  |   94 -
 .../cuda/include/stub/hl_aggregate_stub.h     |   36 -
 paddle/legacy/cuda/include/stub/hl_cnn_stub.h |  247 -
 .../cuda/include/stub/hl_cuda_cublas_stub.h   |   53 -
 .../cuda/include/stub/hl_cuda_cudnn_stub.h    |  201 -
 .../legacy/cuda/include/stub/hl_cuda_stub.h   |   97 -
 .../legacy/cuda/include/stub/hl_lstm_stub.h   |   67 -
 .../legacy/cuda/include/stub/hl_matrix_stub.h |  138 -
 .../cuda/include/stub/hl_sequence_stub.h      |   66 -
 .../legacy/cuda/include/stub/hl_sparse_stub.h |  185 -
 paddle/legacy/cuda/src/avx_mathfun.h          |  735 ---
 paddle/legacy/cuda/src/hl_avx_functions.cc    |   69 -
 paddle/legacy/cuda/src/hl_batch_norm.cu       |   66 -
 paddle/legacy/cuda/src/hl_batch_transpose.cu  |   59 -
 paddle/legacy/cuda/src/hl_cpu_functions.cc    |   44 -
 paddle/legacy/cuda/src/hl_cuda_aggregate.cu   |  293 -
 paddle/legacy/cuda/src/hl_cuda_cnn.cu         | 1106 ----
 paddle/legacy/cuda/src/hl_cuda_cublas.cc      |  400 --
 paddle/legacy/cuda/src/hl_cuda_cudnn.cc       | 1117 ----
 paddle/legacy/cuda/src/hl_cuda_device.cc      |  681 ---
 paddle/legacy/cuda/src/hl_cuda_lstm.cu        |  876 ---
 paddle/legacy/cuda/src/hl_cuda_matrix.cu      |  806 ---
 paddle/legacy/cuda/src/hl_cuda_sequence.cu    |  408 --
 paddle/legacy/cuda/src/hl_cuda_sparse.cu      | 1262 -----
 paddle/legacy/cuda/src/hl_cuda_sparse.cuh     | 1015 ----
 paddle/legacy/cuda/src/hl_math.cc             |   26 -
 .../legacy/cuda/src/hl_perturbation_util.cu   |  289 -
 paddle/legacy/cuda/src/hl_table_apply.cu      |  124 -
 paddle/legacy/cuda/src/hl_time.cc             |   27 -
 paddle/legacy/cuda/src/hl_top_k.cu            |  481 --
 paddle/legacy/cuda/src/hl_warpctc_wrap.cc     |  151 -
 paddle/legacy/function/BlockExpandOp.cpp      |  202 -
 paddle/legacy/function/BlockExpandOpTest.cpp  |  107 -
 paddle/legacy/function/BufferArg.cpp          |   52 -
 paddle/legacy/function/BufferArg.h            |  364 --
 paddle/legacy/function/BufferArgTest.cpp      |   38 -
 paddle/legacy/function/CMakeLists.txt         |   54 -
 .../legacy/function/ContextProjectionOp.cpp   |  412 --
 paddle/legacy/function/ContextProjectionOp.h  |   86 -
 .../legacy/function/ContextProjectionOpGpu.cu |  413 --
 .../function/ContextProjectionOpTest.cpp      |  114 -
 paddle/legacy/function/ConvOp.h               |  157 -
 paddle/legacy/function/ConvOpTest.h           |  275 -
 paddle/legacy/function/CosSimOp.cpp           |  240 -
 paddle/legacy/function/CosSimOp.h             |   61 -
 paddle/legacy/function/CosSimOpGpu.cu         |  248 -
 paddle/legacy/function/CosSimOpTest.cpp       |   64 -
 paddle/legacy/function/CropOp.cpp             |  177 -
 paddle/legacy/function/CropOp.h               |   51 -
 paddle/legacy/function/CropOpGpu.cu           |  150 -
 paddle/legacy/function/CropOpTest.cpp         |   49 -
 paddle/legacy/function/CrossMapNormalOp.cpp   |  344 --
 paddle/legacy/function/CrossMapNormalOp.h     |   81 -
 paddle/legacy/function/CrossMapNormalOpGpu.cu |  177 -
 .../legacy/function/CrossMapNormalOpTest.cpp  |   80 -
 paddle/legacy/function/DepthwiseConvOp.cpp    |  305 --
 paddle/legacy/function/DepthwiseConvOp.h      |  159 -
 paddle/legacy/function/DepthwiseConvOpGpu.cu  |  376 --
 .../legacy/function/DepthwiseConvOpTest.cpp   |   46 -
 paddle/legacy/function/EigenGemm.cpp          |  102 -
 paddle/legacy/function/EigenThreadDevice.h    |   73 -
 paddle/legacy/function/Function.cpp           |   45 -
 paddle/legacy/function/Function.h             |  214 -
 paddle/legacy/function/FunctionTest.cpp       |  166 -
 paddle/legacy/function/FunctionTest.h         |  410 --
 paddle/legacy/function/GemmConvOp.cpp         |  522 --
 paddle/legacy/function/GemmConvOpTest.cpp     |   50 -
 paddle/legacy/function/GemmFunctor.cpp        |   90 -
 paddle/legacy/function/GemmFunctor.h          |   65 -
 paddle/legacy/function/GruFunctor.h           |  159 -
 paddle/legacy/function/Im2Col.h               |  154 -
 paddle/legacy/function/Im2ColOp.cpp           |  245 -
 paddle/legacy/function/Im2ColOpGpu.cu         |  464 --
 paddle/legacy/function/Im2ColTest.cpp         |  223 -
 paddle/legacy/function/MulOp.cpp              |  347 --
 paddle/legacy/function/MulOp.h                |  102 -
 paddle/legacy/function/MulOpGpu.cu            |  130 -
 paddle/legacy/function/MulOpTest.cpp          |  212 -
 paddle/legacy/function/NaiveConvOp.cpp        |  141 -
 paddle/legacy/function/PadOp.cpp              |  215 -
 paddle/legacy/function/PadOp.h                |   73 -
 paddle/legacy/function/PadOpGpu.cu            |  132 -
 paddle/legacy/function/PadOpTest.cpp          |   49 -
 paddle/legacy/function/RowConvOp.cpp          |  225 -
 paddle/legacy/function/RowConvOp.h            |   56 -
 paddle/legacy/function/RowConvOpGpu.cu        |  373 --
 paddle/legacy/function/RowConvOpTest.cpp      |   62 -
 paddle/legacy/function/ScaleSubRegionOp.cpp   |  155 -
 paddle/legacy/function/ScaleSubRegionOp.h     |   55 -
 paddle/legacy/function/ScaleSubRegionOpGpu.cu |  116 -
 .../legacy/function/ScaleSubRegionOpTest.cpp  |   72 -
 paddle/legacy/function/SwitchOp.cpp           |  140 -
 paddle/legacy/function/SwitchOp.h             |   66 -
 paddle/legacy/function/SwitchOpGpu.cu         |   98 -
 paddle/legacy/function/SwitchOpTest.cpp       |   44 -
 paddle/legacy/function/TensorShape.h          |  107 -
 paddle/legacy/function/TensorShapeTest.cpp    |   53 -
 paddle/legacy/function/TensorType.h           |  149 -
 paddle/legacy/function/TensorTypeTest.cpp     |   64 -
 .../function/neon/NeonDepthwiseConv.cpp       |  120 -
 .../legacy/function/neon/NeonDepthwiseConv.h  |  627 ---
 .../neon/NeonDepthwiseConvTranspose.cpp       |  136 -
 paddle/legacy/function/neon/neon_util.h       |   43 -
 .../legacy/function/nnpack/NNPACKConvOp.cpp   |  247 -
 .../function/nnpack/NNPACKConvOpTest.cpp      |   30 -
 paddle/legacy/gserver/CMakeLists.txt          |  152 -
 .../activations/ActivationFunction.cpp        |  509 --
 .../gserver/activations/ActivationFunction.h  |   66 -
 .../gserver/activations/MKLDNNActivation.cpp  |  249 -
 .../gserver/activations/MKLDNNActivation.h    |  119 -
 .../gserver/dataproviders/DataProvider.cpp    |  410 --
 .../gserver/dataproviders/DataProvider.h      |  480 --
 .../gserver/dataproviders/DataProviderGroup.h |  153 -
 .../dataproviders/MultiDataProvider.cpp       |  122 -
 .../gserver/dataproviders/MultiDataProvider.h |   41 -
 .../gserver/dataproviders/ProtoReader.h       |  177 -
 .../gserver/dataproviders/PyDataProvider.cpp  |  498 --
 .../gserver/dataproviders/PyDataProvider.h    |  124 -
 .../gserver/dataproviders/PyDataProvider2.cpp | 1031 ----
 .../gserver/evaluators/CTCErrorEvaluator.cpp  |  320 --
 .../gserver/evaluators/ChunkEvaluator.cpp     |  296 -
 .../evaluators/DetectionMAPEvaluator.cpp      |  308 --
 .../legacy/gserver/evaluators/Evaluator.cpp   | 1361 -----
 paddle/legacy/gserver/evaluators/Evaluator.h  |  510 --
 .../gradientmachines/GradientMachine.cpp      |  104 -
 .../gradientmachines/GradientMachine.h        |  250 -
 .../gradientmachines/GradientMachineMode.cpp  |   20 -
 .../gradientmachines/GradientMachineMode.h    |  149 -
 .../gradientmachines/MultiGradientMachine.cpp |  898 ----
 .../gradientmachines/MultiGradientMachine.h   |  478 --
 .../gserver/gradientmachines/MultiNetwork.cpp |  185 -
 .../gserver/gradientmachines/MultiNetwork.h   |   64 -
 .../gradientmachines/NeuralNetwork.cpp        |  548 --
 .../gserver/gradientmachines/NeuralNetwork.h  |  179 -
 .../ParallelNeuralNetwork.cpp                 |  214 -
 .../gradientmachines/ParallelNeuralNetwork.h  |  113 -
 .../RecurrentGradientMachine.cpp              | 1501 ------
 .../RecurrentGradientMachine.h                |  580 --
 paddle/legacy/gserver/layers/AddtoLayer.cpp   |   79 -
 paddle/legacy/gserver/layers/AddtoLayer.h     |   63 -
 paddle/legacy/gserver/layers/AgentLayer.cpp   |  281 -
 paddle/legacy/gserver/layers/AgentLayer.h     |  177 -
 paddle/legacy/gserver/layers/AverageLayer.cpp |   67 -
 paddle/legacy/gserver/layers/AverageLayer.h   |   54 -
 .../gserver/layers/BatchNormBaseLayer.cpp     |   80 -
 .../gserver/layers/BatchNormBaseLayer.h       |  101 -
 .../layers/BatchNormalizationLayer.cpp        |  266 -
 .../gserver/layers/BatchNormalizationLayer.h  |   70 -
 .../gserver/layers/BilinearInterpLayer.cpp    |  107 -
 .../gserver/layers/BilinearInterpLayer.h      |   47 -
 .../gserver/layers/BlockExpandLayer.cpp       |  121 -
 .../legacy/gserver/layers/BlockExpandLayer.h  |   68 -
 .../gserver/layers/CRFDecodingLayer.cpp       |   69 -
 .../legacy/gserver/layers/CRFDecodingLayer.h  |   44 -
 paddle/legacy/gserver/layers/CRFLayer.cpp     |  117 -
 paddle/legacy/gserver/layers/CRFLayer.h       |   46 -
 paddle/legacy/gserver/layers/CTCLayer.cpp     |  121 -
 paddle/legacy/gserver/layers/CTCLayer.h       |   41 -
 paddle/legacy/gserver/layers/ClipLayer.cpp    |   79 -
 .../gserver/layers/ConcatenateLayer.cpp       |  208 -
 .../gserver/layers/ContextProjection.cpp      |  185 -
 .../legacy/gserver/layers/ContextProjection.h |   78 -
 paddle/legacy/gserver/layers/Conv3DLayer.cpp  |  253 -
 paddle/legacy/gserver/layers/Conv3DLayer.h    |   51 -
 .../legacy/gserver/layers/ConvBaseLayer.cpp   |  120 -
 paddle/legacy/gserver/layers/ConvBaseLayer.h  |  107 -
 .../gserver/layers/ConvBaseOperator.cpp       |  151 -
 .../legacy/gserver/layers/ConvBaseOperator.h  |  112 -
 .../gserver/layers/ConvBaseProjection.cpp     |  199 -
 .../gserver/layers/ConvBaseProjection.h       |  111 -
 paddle/legacy/gserver/layers/ConvOperator.cpp |  128 -
 paddle/legacy/gserver/layers/ConvOperator.h   |   44 -
 .../legacy/gserver/layers/ConvProjection.cpp  |  123 -
 paddle/legacy/gserver/layers/ConvProjection.h |   43 -
 .../legacy/gserver/layers/ConvShiftLayer.cpp  |  108 -
 .../gserver/layers/ConvTransOperator.cpp      |  125 -
 .../legacy/gserver/layers/ConvTransOperator.h |   44 -
 .../gserver/layers/ConvTransProjection.cpp    |  123 -
 .../gserver/layers/ConvTransProjection.h      |   43 -
 .../gserver/layers/ConvexCombinationLayer.cpp |  155 -
 paddle/legacy/gserver/layers/CosSimLayer.cpp  |   93 -
 paddle/legacy/gserver/layers/CosSimLayer.h    |   48 -
 .../gserver/layers/CosSimVecMatLayer.cpp      |  182 -
 paddle/legacy/gserver/layers/CostLayer.cpp    |  748 ---
 paddle/legacy/gserver/layers/CostLayer.h      |  374 --
 paddle/legacy/gserver/layers/CropLayer.cpp    |  146 -
 paddle/legacy/gserver/layers/CropLayer.h      |   52 -
 .../gserver/layers/CrossChannelNormLayer.cpp  |  137 -
 .../gserver/layers/CrossEntropyOverBeam.cpp   |  393 --
 .../gserver/layers/CrossEntropyOverBeam.h     |  135 -
 .../gserver/layers/CudnnBatchNormLayer.cpp    |  180 -
 .../gserver/layers/CudnnBatchNormLayer.h      |   68 -
 .../gserver/layers/CudnnConvBaseLayer.cpp     |  135 -
 .../gserver/layers/CudnnConvBaseLayer.h       |   53 -
 .../legacy/gserver/layers/CudnnPoolLayer.cpp  |  139 -
 paddle/legacy/gserver/layers/CudnnPoolLayer.h |   61 -
 paddle/legacy/gserver/layers/DataLayer.cpp    |   67 -
 paddle/legacy/gserver/layers/DataLayer.h      |   70 -
 .../legacy/gserver/layers/DataNormLayer.cpp   |  140 -
 paddle/legacy/gserver/layers/DataNormLayer.h  |   62 -
 .../legacy/gserver/layers/DeConv3DLayer.cpp   |  220 -
 paddle/legacy/gserver/layers/DeConv3DLayer.h  |   52 -
 .../gserver/layers/DetectionOutputLayer.cpp   |  160 -
 .../gserver/layers/DetectionOutputLayer.h     |   77 -
 .../legacy/gserver/layers/DetectionUtil.cpp   |  576 --
 paddle/legacy/gserver/layers/DetectionUtil.h  |  307 --
 .../legacy/gserver/layers/DotMulOperator.cpp  |   62 -
 .../gserver/layers/DotMulProjection.cpp       |   68 -
 paddle/legacy/gserver/layers/DotProdLayer.cpp |   97 -
 .../legacy/gserver/layers/EosIdCheckLayer.cpp |   50 -
 .../legacy/gserver/layers/ExpandConvLayer.cpp |  248 -
 .../legacy/gserver/layers/ExpandConvLayer.h   |   51 -
 paddle/legacy/gserver/layers/ExpandLayer.cpp  |  133 -
 paddle/legacy/gserver/layers/ExpandLayer.h    |   63 -
 .../layers/FactorizationMachineLayer.cpp      |  158 -
 .../layers/FactorizationMachineLayer.h        |   80 -
 .../gserver/layers/FeatureMapExpandLayer.cpp  |  155 -
 .../gserver/layers/FullMatrixProjection.cpp   |   60 -
 .../gserver/layers/FullMatrixProjection.h     |   42 -
 .../gserver/layers/FullyConnectedLayer.cpp    |  150 -
 .../gserver/layers/FullyConnectedLayer.h      |   49 -
 .../gserver/layers/GatedRecurrentLayer.cpp    |  414 --
 .../gserver/layers/GatedRecurrentLayer.h      |  100 -
 .../legacy/gserver/layers/GetOutputLayer.cpp  |   41 -
 paddle/legacy/gserver/layers/GruCompute.cpp   |   54 -
 paddle/legacy/gserver/layers/GruCompute.cu    |   47 -
 paddle/legacy/gserver/layers/GruCompute.h     |   41 -
 paddle/legacy/gserver/layers/GruStepLayer.cpp |  177 -
 .../layers/HierarchicalSigmoidLayer.cpp       |  240 -
 .../gserver/layers/HierarchicalSigmoidLayer.h |   94 -
 .../gserver/layers/IdentityProjection.cpp     |  103 -
 .../gserver/layers/InterpolationLayer.cpp     |  130 -
 .../gserver/layers/KmaxSeqScoreLayer.cpp      |  126 -
 .../legacy/gserver/layers/L2DistanceLayer.cpp |   91 -
 .../legacy/gserver/layers/L2DistanceLayer.h   |   52 -
 paddle/legacy/gserver/layers/Layer.cpp        |  410 --
 paddle/legacy/gserver/layers/Layer.h          |  512 --
 .../legacy/gserver/layers/LinearChainCRF.cpp  |  218 -
 paddle/legacy/gserver/layers/LinearChainCRF.h |   97 -
 .../legacy/gserver/layers/LinearChainCTC.cpp  |  265 -
 paddle/legacy/gserver/layers/LinearChainCTC.h |   50 -
 paddle/legacy/gserver/layers/LstmCompute.cpp  |   93 -
 paddle/legacy/gserver/layers/LstmCompute.cu   |   73 -
 paddle/legacy/gserver/layers/LstmCompute.h    |   66 -
 paddle/legacy/gserver/layers/LstmLayer.cpp    |  805 ---
 paddle/legacy/gserver/layers/LstmLayer.h      |  221 -
 .../legacy/gserver/layers/LstmStepLayer.cpp   |  194 -
 paddle/legacy/gserver/layers/MDLstmLayer.cpp  |  769 ---
 .../gserver/layers/MKLDNNAddtoLayer.cpp       |  219 -
 .../legacy/gserver/layers/MKLDNNAddtoLayer.h  |   87 -
 paddle/legacy/gserver/layers/MKLDNNBase.h     |   97 -
 .../gserver/layers/MKLDNNBatchNormLayer.cpp   |  306 --
 .../gserver/layers/MKLDNNBatchNormLayer.h     |  125 -
 .../gserver/layers/MKLDNNConcatLayer.cpp      |  186 -
 .../legacy/gserver/layers/MKLDNNConcatLayer.h |   96 -
 .../legacy/gserver/layers/MKLDNNConvLayer.cpp |  388 --
 .../legacy/gserver/layers/MKLDNNConvLayer.h   |  161 -
 .../legacy/gserver/layers/MKLDNNFcLayer.cpp   |  262 -
 paddle/legacy/gserver/layers/MKLDNNFcLayer.h  |  107 -
 .../legacy/gserver/layers/MKLDNNLRNLayer.cpp  |  163 -
 paddle/legacy/gserver/layers/MKLDNNLRNLayer.h |   78 -
 paddle/legacy/gserver/layers/MKLDNNLayer.cpp  |  304 --
 paddle/legacy/gserver/layers/MKLDNNLayer.h    |  477 --
 .../legacy/gserver/layers/MKLDNNPoolLayer.cpp |  195 -
 .../legacy/gserver/layers/MKLDNNPoolLayer.h   |  110 -
 .../layers/MKLPackedRecurrentLayer.cpp        |  132 -
 .../gserver/layers/MKLPackedRecurrentLayer.h  |   58 -
 .../legacy/gserver/layers/MKLPackedWeight.h   |   86 -
 paddle/legacy/gserver/layers/MaxIdLayer.cpp   |   62 -
 paddle/legacy/gserver/layers/MaxLayer.cpp     |   65 -
 paddle/legacy/gserver/layers/MaxLayer.h       |   58 -
 paddle/legacy/gserver/layers/MaxOutLayer.cpp  |   87 -
 paddle/legacy/gserver/layers/MaxOutLayer.h    |   55 -
 .../gserver/layers/MaxPoolWithMaskLayer.cpp   |  109 -
 .../gserver/layers/MaxPoolWithMaskLayer.h     |   40 -
 paddle/legacy/gserver/layers/MixedLayer.cpp   |  176 -
 paddle/legacy/gserver/layers/MixedLayer.h     |   63 -
 .../gserver/layers/MultiBoxLossLayer.cpp      |  376 --
 .../legacy/gserver/layers/MultiBoxLossLayer.h |  103 -
 .../gserver/layers/MultinomialSampler.cpp     |   86 -
 .../gserver/layers/MultinomialSampler.h       |   81 -
 .../legacy/gserver/layers/MultiplexLayer.cpp  |  180 -
 paddle/legacy/gserver/layers/NCELayer.cpp     |  323 --
 paddle/legacy/gserver/layers/NormLayer.cpp    |   59 -
 paddle/legacy/gserver/layers/NormLayer.h      |   99 -
 .../gserver/layers/NormProjectionLayer.cpp    |  101 -
 .../gserver/layers/NormProjectionLayer.h      |   47 -
 paddle/legacy/gserver/layers/Operator.cpp     |   25 -
 paddle/legacy/gserver/layers/Operator.h       |   96 -
 .../legacy/gserver/layers/OuterProdLayer.cpp  |  141 -
 paddle/legacy/gserver/layers/PadLayer.cpp     |  106 -
 paddle/legacy/gserver/layers/PadLayer.h       |   47 -
 .../gserver/layers/ParameterReluLayer.cpp     |   69 -
 .../gserver/layers/ParameterReluLayer.h       |   65 -
 paddle/legacy/gserver/layers/Pool3DLayer.cpp  |  178 -
 paddle/legacy/gserver/layers/Pool3DLayer.h    |   49 -
 paddle/legacy/gserver/layers/PoolLayer.cpp    |   70 -
 paddle/legacy/gserver/layers/PoolLayer.h      |   55 -
 .../legacy/gserver/layers/PoolProjection.cpp  |  175 -
 paddle/legacy/gserver/layers/PoolProjection.h |   68 -
 .../gserver/layers/PoolProjectionLayer.cpp    |   65 -
 .../gserver/layers/PoolProjectionLayer.h      |   46 -
 paddle/legacy/gserver/layers/PowerLayer.cpp   |  120 -
 paddle/legacy/gserver/layers/PrintLayer.cpp   |   68 -
 paddle/legacy/gserver/layers/PriorBox.cpp     |  159 -
 paddle/legacy/gserver/layers/Projection.cpp   |   32 -
 paddle/legacy/gserver/layers/Projection.h     |  140 -
 paddle/legacy/gserver/layers/ROIPoolLayer.cpp |  233 -
 paddle/legacy/gserver/layers/ROIPoolLayer.h   |   56 -
 .../legacy/gserver/layers/RecurrentLayer.cpp  |  301 --
 paddle/legacy/gserver/layers/RecurrentLayer.h |  130 -
 .../gserver/layers/RecurrentLayerGroup.cpp    |   95 -
 paddle/legacy/gserver/layers/ResizeLayer.cpp  |   79 -
 paddle/legacy/gserver/layers/RotateLayer.cpp  |  102 -
 paddle/legacy/gserver/layers/RotateLayer.h    |   51 -
 paddle/legacy/gserver/layers/RowConvLayer.cpp |  106 -
 paddle/legacy/gserver/layers/RowConvLayer.h   |   44 -
 .../legacy/gserver/layers/RowL2NormLayer.cpp  |   98 -
 .../legacy/gserver/layers/SamplingIdLayer.cpp |   91 -
 .../legacy/gserver/layers/ScaleShiftLayer.cpp |  107 -
 .../gserver/layers/ScaleSubRegionLayer.cpp    |   78 -
 .../gserver/layers/ScaleSubRegionLayer.h      |   52 -
 paddle/legacy/gserver/layers/ScalingLayer.cpp |  106 -
 .../gserver/layers/ScalingProjection.cpp      |   57 -
 .../layers/SelectiveFullyConnectedLayer.cpp   |  336 --
 .../layers/SelectiveFullyConnectedLayer.h     |  103 -
 .../gserver/layers/SequenceConcatLayer.cpp    |  189 -
 .../layers/SequenceLastInstanceLayer.cpp      |  118 -
 .../gserver/layers/SequencePoolLayer.cpp      |   93 -
 .../legacy/gserver/layers/SequencePoolLayer.h |   64 -
 .../gserver/layers/SequenceReshapeLayer.cpp   |  157 -
 .../gserver/layers/SequenceSliceLayer.cpp     |  224 -
 .../legacy/gserver/layers/SequenceToBatch.cpp |  256 -
 .../legacy/gserver/layers/SequenceToBatch.h   |  107 -
 .../legacy/gserver/layers/SliceProjection.cpp |   96 -
 .../gserver/layers/SlopeInterceptLayer.cpp    |   94 -
 .../layers/SpatialPyramidPoolLayer.cpp        |  134 -
 .../gserver/layers/SpatialPyramidPoolLayer.h  |   59 -
 .../gserver/layers/SubNestedSequenceLayer.cpp |  187 -
 .../gserver/layers/SubSequenceLayer.cpp       |  226 -
 .../gserver/layers/SumToOneNormLayer.cpp      |  120 -
 .../gserver/layers/SwitchOrderLayer.cpp       |  109 -
 .../legacy/gserver/layers/SwitchOrderLayer.h  |   47 -
 .../legacy/gserver/layers/TableProjection.cpp |   51 -
 .../legacy/gserver/layers/TableProjection.h   |   50 -
 paddle/legacy/gserver/layers/TensorLayer.cpp  |  145 -
 paddle/legacy/gserver/layers/TensorLayer.h    |   55 -
 paddle/legacy/gserver/layers/TransLayer.cpp   |   69 -
 paddle/legacy/gserver/layers/TransLayer.h     |   41 -
 .../layers/TransposedFullMatrixProjection.cpp |   80 -
 .../legacy/gserver/layers/UpsampleLayer.cpp   |  108 -
 paddle/legacy/gserver/layers/UpsampleLayer.h  |   53 -
 .../legacy/gserver/layers/ValidationLayer.cpp |  171 -
 .../legacy/gserver/layers/ValidationLayer.h   |  104 -
 paddle/legacy/gserver/layers/WarpCTCLayer.cpp |  222 -
 paddle/legacy/gserver/layers/WarpCTCLayer.h   |   66 -
 paddle/legacy/gserver/tests/.gitignore        |    1 -
 paddle/legacy/gserver/tests/CMakeLists.txt    |  103 -
 paddle/legacy/gserver/tests/LayerGradUtil.cpp |  854 ---
 paddle/legacy/gserver/tests/LayerGradUtil.h   |  329 --
 paddle/legacy/gserver/tests/MKLDNNTester.cpp  |  580 --
 paddle/legacy/gserver/tests/MKLDNNTester.h    |  143 -
 .../legacy/gserver/tests/Sequence/dummy.list  |    1 -
 .../tests/Sequence/tour_dict_phrase.dict      |  158 -
 .../gserver/tests/Sequence/tour_train_wdseg   |   10 -
 .../tests/Sequence/tour_train_wdseg.nest      |   14 -
 .../legacy/gserver/tests/Sequence/train.list  |    1 -
 .../gserver/tests/Sequence/train.list.nest    |    1 -
 paddle/legacy/gserver/tests/__init__.py       |   13 -
 .../legacy/gserver/tests/concat_dotmul_a.conf |   31 -
 .../legacy/gserver/tests/concat_dotmul_b.conf |   29 -
 .../gserver/tests/concat_fullmatrix_a.conf    |   35 -
 .../gserver/tests/concat_fullmatrix_b.conf    |   29 -
 .../legacy/gserver/tests/concat_slice_a.conf  |   41 -
 .../legacy/gserver/tests/concat_slice_b.conf  |   41 -
 .../legacy/gserver/tests/concat_table_a.conf  |   32 -
 .../legacy/gserver/tests/concat_table_b.conf  |   29 -
 paddle/legacy/gserver/tests/img_conv_a.conf   |   40 -
 paddle/legacy/gserver/tests/img_conv_b.conf   |   32 -
 paddle/legacy/gserver/tests/img_conv_c.conf   |   43 -
 paddle/legacy/gserver/tests/img_conv_cudnn.py |   31 -
 .../legacy/gserver/tests/img_conv_exconv.py   |   31 -
 paddle/legacy/gserver/tests/img_pool_a.conf   |   44 -
 paddle/legacy/gserver/tests/img_pool_b.conf   |   44 -
 .../gserver/tests/mkldnn_branch_net.conf      |  142 -
 .../gserver/tests/mkldnn_simple_net.conf      |   66 -
 paddle/legacy/gserver/tests/pyDataProvider.py |  146 -
 .../tests/pyDataProvider/pyDataProviderList   |    0
 .../gserver/tests/pyDataProvider/trainer.conf |   75 -
 .../legacy/gserver/tests/rnn_data_provider.py |  115 -
 paddle/legacy/gserver/tests/sequenceGen.py    |   70 -
 .../gserver/tests/sequence_layer_group.conf   |   62 -
 .../legacy/gserver/tests/sequence_lstm.conf   |   64 -
 .../tests/sequence_nest_layer_group.conf      |   83 -
 .../gserver/tests/sequence_nest_rnn.conf      |   74 -
 .../tests/sequence_nest_rnn_multi_input.conf  |   76 -
 ...ence_nest_rnn_multi_unequalength_inputs.py |   96 -
 .../gserver/tests/sequence_recurrent.py       |   55 -
 .../gserver/tests/sequence_recurrent_group.py |   68 -
 paddle/legacy/gserver/tests/sequence_rnn.conf |   57 -
 .../tests/sequence_rnn_matched_inputs.py      |   84 -
 .../tests/sequence_rnn_mixed_inputs.py        |   78 -
 .../tests/sequence_rnn_multi_input.conf       |   58 -
 .../sequence_rnn_multi_unequalength_inputs.py |   76 -
 .../gserver/tests/test_ActivationGrad.cpp     |   98 -
 .../legacy/gserver/tests/test_BatchNorm.cpp   |  195 -
 .../gserver/tests/test_CRFLayerGrad.cpp       |  173 -
 .../gserver/tests/test_CompareSparse.cpp      |  228 -
 .../gserver/tests/test_CompareTwoNets.cpp     |  210 -
 .../legacy/gserver/tests/test_ConvTrans.cpp   |  244 -
 .../legacy/gserver/tests/test_ConvUnify.cpp   |  315 --
 .../tests/test_CrossEntropyOverBeamGrad.cpp   |  352 --
 .../gserver/tests/test_DetectionOutput.cpp    |  194 -
 .../legacy/gserver/tests/test_Evaluator.cpp   |  267 -
 paddle/legacy/gserver/tests/test_Expand.cpp   |  127 -
 .../gserver/tests/test_KmaxSeqScore.cpp       |  164 -
 .../legacy/gserver/tests/test_LayerGrad.cpp   | 2532 ---------
 .../gserver/tests/test_LinearChainCRF.cpp     |   67 -
 paddle/legacy/gserver/tests/test_MKLDNN.cpp   |  448 --
 .../tests/test_MaxPoolingWithMaskOutput.cpp   |  117 -
 .../gserver/tests/test_MultinomialSampler.cpp |  147 -
 .../gserver/tests/test_NetworkCompare.cpp     |  294 -
 paddle/legacy/gserver/tests/test_PriorBox.cpp |  212 -
 .../gserver/tests/test_PyDataProvider.cpp     |  177 -
 .../gserver/tests/test_PyDataProvider2.cpp    |  409 --
 .../gserver/tests/test_PyDataProvider2.py     |  125 -
 .../tests/test_RecurrentGradientMachine.cpp   |  180 -
 .../gserver/tests/test_RecurrentLayer.cpp     |  571 --
 .../gserver/tests/test_SelectiveFCLayer.cpp   |  471 --
 .../gserver/tests/test_SeqSliceLayerGrad.cpp  |  224 -
 paddle/legacy/gserver/tests/test_Upsample.cpp |  153 -
 .../gserver/tests/test_WarpCTCLayer.cpp       |  244 -
 paddle/legacy/math/Allocator.h                |  137 -
 paddle/legacy/math/BaseMatrix.cu              | 1953 -------
 paddle/legacy/math/BaseMatrix.h               | 1095 ----
 paddle/legacy/math/CMakeLists.txt             |   57 -
 paddle/legacy/math/CpuSparseMatrix.cpp        |  787 ---
 paddle/legacy/math/CpuSparseMatrix.h          |  377 --
 paddle/legacy/math/ExecViaCpu.h               |  195 -
 paddle/legacy/math/MKLDNNMatrix.cpp           |  158 -
 paddle/legacy/math/MKLDNNMatrix.h             |  256 -
 paddle/legacy/math/MathFunctions.cpp          |  348 --
 paddle/legacy/math/MathFunctions.h            |  129 -
 paddle/legacy/math/MathUtils.cpp              |   97 -
 paddle/legacy/math/MathUtils.h                |   70 -
 paddle/legacy/math/Matrix.cpp                 | 4787 -----------------
 paddle/legacy/math/Matrix.h                   | 2189 --------
 paddle/legacy/math/MatrixBitCode.cpp          |  291 -
 paddle/legacy/math/MemoryHandle.cpp           |   56 -
 paddle/legacy/math/MemoryHandle.h             |   65 -
 paddle/legacy/math/NEONFunctions.cpp          |   95 -
 paddle/legacy/math/NEONFunctions.h            |   24 -
 paddle/legacy/math/PoolAllocator.cpp          |   83 -
 paddle/legacy/math/PoolAllocator.h            |   61 -
 paddle/legacy/math/RowBuffer.h                |  139 -
 paddle/legacy/math/SIMDFunctions.cpp          |  397 --
 paddle/legacy/math/SIMDFunctions.h            |  179 -
 paddle/legacy/math/SparseMatrix.cpp           |  864 ---
 paddle/legacy/math/SparseMatrix.h             |  286 -
 paddle/legacy/math/SparseRowMatrix.cpp        |  282 -
 paddle/legacy/math/SparseRowMatrix.h          |  341 --
 paddle/legacy/math/Storage.cpp                |  101 -
 paddle/legacy/math/Storage.h                  |   52 -
 paddle/legacy/math/TensorApply.h              |  211 -
 paddle/legacy/math/TensorAssign.h             |  158 -
 paddle/legacy/math/TensorEvaluate.h           |  112 -
 paddle/legacy/math/TensorExpression.h         |  446 --
 paddle/legacy/math/TrainingAlgorithmOp.cu     |  356 --
 paddle/legacy/math/TrainingAlgorithmOp.h      |  122 -
 paddle/legacy/math/Vector.cpp                 | 1091 ----
 paddle/legacy/math/Vector.h                   |  726 ---
 paddle/legacy/math/tests/CMakeLists.txt       |   35 -
 .../legacy/math/tests/OriginalOptimizerApi.h  |  201 -
 paddle/legacy/math/tests/PerfUtils.h          |   46 -
 paddle/legacy/math/tests/TensorCheck.h        |  216 -
 paddle/legacy/math/tests/TestUtils.h          |  294 -
 paddle/legacy/math/tests/test_Allocator.cpp   |  122 -
 paddle/legacy/math/tests/test_BaseMatrix.cpp  |  247 -
 .../legacy/math/tests/test_CpuGpuVector.cpp   |   80 -
 paddle/legacy/math/tests/test_ExecViaCpu.cpp  |  116 -
 paddle/legacy/math/tests/test_FPException.cpp |   93 -
 paddle/legacy/math/tests/test_GpuProfiler.cpp |  165 -
 paddle/legacy/math/tests/test_Matrix.cpp      |  273 -
 paddle/legacy/math/tests/test_RowBuffer.cpp   |   65 -
 .../legacy/math/tests/test_SIMDFunctions.cpp  |  171 -
 .../legacy/math/tests/test_SparseMatrix.cpp   |  565 --
 paddle/legacy/math/tests/test_Tensor.cu       | 1162 ----
 .../math/tests/test_TrainingAlgorithm.cpp     |  461 --
 .../legacy/math/tests/test_batchTranspose.cpp |   55 -
 paddle/legacy/math/tests/test_lazyAssign.cu   |  147 -
 .../legacy/math/tests/test_matrixCompare.cpp  | 1698 ------
 paddle/legacy/math/tests/test_matrixUtil.h    |  233 -
 .../legacy/math/tests/test_perturbation.cpp   |  318 --
 .../math/tests/test_sparseMatrixCompare.cpp   |  174 -
 paddle/legacy/optimizer/CMakeLists.txt        |   16 -
 paddle/legacy/optimizer/adadelta_optimizer.cc |   69 -
 paddle/legacy/optimizer/adadelta_optimizer.h  |   53 -
 paddle/legacy/optimizer/adagrad_optimizer.cc  |   57 -
 paddle/legacy/optimizer/adagrad_optimizer.h   |   46 -
 paddle/legacy/optimizer/adam_optimizer.cc     |   63 -
 paddle/legacy/optimizer/adam_optimizer.h      |   55 -
 paddle/legacy/optimizer/lr_policy.h           |   82 -
 paddle/legacy/optimizer/optimizer.cc          |  106 -
 paddle/legacy/optimizer/optimizer.h           |  107 -
 .../legacy/optimizer/parameter_optimizer.cc   |   92 -
 paddle/legacy/optimizer/parameter_optimizer.h |   56 -
 .../optimizer/parameter_optimizer_test.cc     |  127 -
 paddle/legacy/optimizer/serialization.h       |   49 -
 paddle/legacy/optimizer/serialization_test.cc |   46 -
 paddle/legacy/optimizer/sgd_optimizer.cc      |   65 -
 paddle/legacy/optimizer/sgd_optimizer.h       |   50 -
 paddle/legacy/optimizer/tensor.h              |   68 -
 paddle/legacy/parameter/Argument.cpp          |  707 ---
 paddle/legacy/parameter/Argument.h            |  349 --
 paddle/legacy/parameter/AverageOptimizer.cpp  |  206 -
 paddle/legacy/parameter/AverageOptimizer.h    |  145 -
 paddle/legacy/parameter/CMakeLists.txt        |   11 -
 .../legacy/parameter/FirstOrderOptimizer.cpp  |  330 --
 paddle/legacy/parameter/FirstOrderOptimizer.h |  381 --
 .../parameter/LearningRateScheduler.cpp       |  173 -
 .../legacy/parameter/LearningRateScheduler.h  |   37 -
 .../legacy/parameter/OptimizerFunctions.cpp   |   50 -
 paddle/legacy/parameter/OptimizerFunctions.h  |   43 -
 .../parameter/OptimizerWithRegularizer.cpp    |  193 -
 .../parameter/OptimizerWithRegularizer.h      |  157 -
 paddle/legacy/parameter/Parameter.cpp         |  425 --
 paddle/legacy/parameter/Parameter.h           |  380 --
 .../legacy/parameter/ParameterOptimizer.cpp   |   63 -
 paddle/legacy/parameter/ParameterOptimizer.h  |  211 -
 .../parameter/ParameterUpdateFunctions.cpp    |  300 --
 .../parameter/ParameterUpdateFunctions.h      |   56 -
 .../legacy/parameter/ParameterUpdaterBase.cpp |   41 -
 .../legacy/parameter/ParameterUpdaterBase.h   |  182 -
 .../legacy/parameter/ParameterUpdaterHook.cpp |  155 -
 .../legacy/parameter/ParameterUpdaterHook.h   |   63 -
 paddle/legacy/parameter/Regularizer.cpp       |   54 -
 paddle/legacy/parameter/Regularizer.h         |  115 -
 paddle/legacy/parameter/ThreadLocalBuffer.cpp |   35 -
 paddle/legacy/parameter/ThreadLocalBuffer.h   |   22 -
 paddle/legacy/parameter/Weight.cpp            |   84 -
 paddle/legacy/parameter/Weight.h              |   48 -
 paddle/legacy/parameter/tests/CMakeLists.txt  |    2 -
 .../legacy/parameter/tests/test_argument.cpp  |   57 -
 paddle/legacy/parameter/tests/test_common.cpp |  174 -
 paddle/legacy/pserver/BaseClient.cpp          |   80 -
 paddle/legacy/pserver/BaseClient.h            |  311 --
 paddle/legacy/pserver/CMakeLists.txt          |   56 -
 paddle/legacy/pserver/LightNetwork.cpp        |  459 --
 paddle/legacy/pserver/LightNetwork.h          |  185 -
 paddle/legacy/pserver/ParameterClient2.cpp    |  781 ---
 paddle/legacy/pserver/ParameterClient2.h      |  602 ---
 paddle/legacy/pserver/ParameterServer2.cpp    | 1401 -----
 paddle/legacy/pserver/ParameterServer2.h      |  696 ---
 .../legacy/pserver/ParameterServer2Main.cpp   |   29 -
 .../pserver/ParameterServerController.cpp     |  102 -
 .../pserver/ParameterServerController.h       |   74 -
 paddle/legacy/pserver/ProtoServer.cpp         |   74 -
 paddle/legacy/pserver/ProtoServer.h           |  267 -
 paddle/legacy/pserver/RDMANetwork.h           |  158 -
 paddle/legacy/pserver/SocketChannel.cpp       |  235 -
 paddle/legacy/pserver/SocketChannel.h         |  153 -
 .../pserver/SparseParameterDistribution.cpp   |  123 -
 .../pserver/SparseParameterDistribution.h     |   52 -
 paddle/legacy/pserver/test/.gitignore         |    5 -
 paddle/legacy/pserver/test/CMakeLists.txt     |   28 -
 paddle/legacy/pserver/test/SocketTest.cpp     |  256 -
 .../pserver/test/test_ParameterServer2.cpp    |  624 ---
 .../legacy/pserver/test/test_ProtoServer.cpp  |  169 -
 .../legacy/pserver/test/test_ProtoServer.sh   |   33 -
 paddle/legacy/trainer/CMakeLists.txt          |   73 -
 paddle/legacy/trainer/MergeModel.cpp          |   64 -
 .../trainer/NewRemoteParameterUpdater.cpp     |  150 -
 .../trainer/NewRemoteParameterUpdater.h       |  121 -
 paddle/legacy/trainer/ParamUtil.cpp           |  163 -
 paddle/legacy/trainer/ParamUtil.h             |  125 -
 paddle/legacy/trainer/ParameterUpdater.cpp    |  152 -
 paddle/legacy/trainer/ParameterUpdater.h      |  265 -
 .../legacy/trainer/RemoteParameterUpdater.cpp |  843 ---
 .../legacy/trainer/RemoteParameterUpdater.h   |  416 --
 paddle/legacy/trainer/Tester.cpp              |  380 --
 paddle/legacy/trainer/Tester.h                |  149 -
 paddle/legacy/trainer/TesterConfig.h          |  138 -
 .../legacy/trainer/ThreadParameterUpdater.cpp |  309 --
 .../legacy/trainer/ThreadParameterUpdater.h   |   85 -
 paddle/legacy/trainer/Trainer.cpp             |  653 ---
 paddle/legacy/trainer/Trainer.h               |  204 -
 paddle/legacy/trainer/TrainerBenchmark.cpp    |   71 -
 paddle/legacy/trainer/TrainerConfigHelper.cpp |  199 -
 paddle/legacy/trainer/TrainerConfigHelper.h   |  205 -
 paddle/legacy/trainer/TrainerInternal.cpp     |  303 --
 paddle/legacy/trainer/TrainerInternal.h       |  139 -
 .../legacy/trainer/TrainerInternalConfig.cpp  |   49 -
 paddle/legacy/trainer/TrainerInternalConfig.h |  233 -
 paddle/legacy/trainer/TrainerMain.cpp         |   65 -
 paddle/legacy/trainer/tests/.gitignore        |    3 -
 paddle/legacy/trainer/tests/CMakeLists.txt    |   41 -
 paddle/legacy/trainer/tests/__init__.py       |   13 -
 .../trainer/tests/config_parser_test.py       |   23 -
 .../legacy/trainer/tests/fake_file_list.list  |    1 -
 paddle/legacy/trainer/tests/picojson.h        | 1103 ----
 .../test_pydata_provider_wrapper.data         |    2 -
 .../test_pydata_provider_wrapper.list         |    1 -
 .../tests/rnn_gen_test_model_dir/r1.test.beam |   60 -
 .../tests/rnn_gen_test_model_dir/r1.test.nest |   16 -
 .../rnn_gen_test_model_dir/r1.test.nobeam     |   16 -
 .../rnn_gen_test_model_dir/t1/transtable      |  Bin 116 -> 0 bytes
 .../tests/rnn_gen_test_model_dir/t1/wordvec   |  Bin 116 -> 0 bytes
 paddle/legacy/trainer/tests/sample_data.txt   |   10 -
 .../legacy/trainer/tests/sample_filelist.txt  |    1 -
 .../trainer/tests/sample_trainer_config.conf  |   87 -
 .../tests/sample_trainer_config_hsigmoid.conf |   53 -
 .../tests/sample_trainer_config_parallel.conf |   86 -
 .../tests/sample_trainer_nest_rnn_gen.conf    |   73 -
 .../trainer/tests/sample_trainer_rnn_gen.conf |   66 -
 .../tests/simple_sparse_neural_network.py     |   37 -
 .../tests/simple_sparse_neural_network_dp.py  |   35 -
 .../legacy/trainer/tests/testPyDataWrapper.py |  130 -
 paddle/legacy/trainer/tests/test_Compare.cpp  |  158 -
 .../tests/test_PyDataProviderWrapper.cpp      |  220 -
 paddle/legacy/trainer/tests/test_Trainer.cpp  |  107 -
 .../trainer/tests/test_TrainerOnePass.cpp     |  318 --
 paddle/legacy/trainer/tests/test_config.conf  |   77 -
 paddle/legacy/trainer/tests/test_gen_dict.txt |    9 -
 .../test_recurrent_machine_generation.cpp     |  157 -
 paddle/legacy/utils/.gitignore                |    1 -
 paddle/legacy/utils/Any.h                     |   35 -
 paddle/legacy/utils/CMakeLists.txt            |   20 -
 paddle/legacy/utils/ClassRegistrar.h          |   81 -
 paddle/legacy/utils/Common.h                  |   35 -
 paddle/legacy/utils/CpuId.cpp                 |   66 -
 paddle/legacy/utils/CpuId.h                   |  136 -
 paddle/legacy/utils/CustomStackTrace.cpp      |   59 -
 paddle/legacy/utils/CustomStackTrace.h        |  193 -
 paddle/legacy/utils/DynamicLoader.cpp         |  170 -
 paddle/legacy/utils/DynamicLoader.h           |   68 -
 paddle/legacy/utils/Error.h                   |  145 -
 paddle/legacy/utils/Excepts.h                 |   28 -
 paddle/legacy/utils/Flags.cpp                 |   91 -
 paddle/legacy/utils/Flags.h                   |   44 -
 paddle/legacy/utils/GlobalConstants.cpp       |   23 -
 paddle/legacy/utils/GlobalConstants.h         |   97 -
 paddle/legacy/utils/Locks.h                   |  242 -
 paddle/legacy/utils/Logging.cpp               |   47 -
 paddle/legacy/utils/Logging.h                 |   46 -
 paddle/legacy/utils/PythonUtil.cpp            |  215 -
 paddle/legacy/utils/PythonUtil.h              |  381 --
 paddle/legacy/utils/Queue.h                   |  255 -
 paddle/legacy/utils/Stat.cpp                  |  165 -
 paddle/legacy/utils/Stat.h                    |  302 --
 paddle/legacy/utils/StringUtil.cpp            |   57 -
 paddle/legacy/utils/StringUtil.h              |  105 -
 paddle/legacy/utils/Thread.h                  |  615 ---
 paddle/legacy/utils/ThreadLocal.cpp           |   61 -
 paddle/legacy/utils/ThreadLocal.h             |  231 -
 paddle/legacy/utils/Util.cpp                  |  409 --
 paddle/legacy/utils/Util.h                    |  597 --
 paddle/legacy/utils/Version.cpp               |   60 -
 paddle/legacy/utils/Version.h                 |  131 -
 paddle/legacy/utils/arch/linux/Locks.cpp      |  149 -
 paddle/legacy/utils/arch/osx/Excepts.cpp      |   57 -
 paddle/legacy/utils/arch/osx/Locks.cpp        |  105 -
 paddle/legacy/utils/enable_virtualenv.py      |   26 -
 paddle/legacy/utils/tests/CMakeLists.txt      |   18 -
 .../utils/tests/test_CustomStackTrace.cpp     |   92 -
 .../tests/test_CustomStackTracePrint.cpp      |   30 -
 .../utils/tests/test_CustomStackTracePrint.sh |   15 -
 paddle/legacy/utils/tests/test_Error.cpp      |   34 -
 paddle/legacy/utils/tests/test_SIMDFlags.cpp  |   48 -
 paddle/legacy/utils/tests/test_SpinLock.cpp   |   55 -
 .../legacy/utils/tests/test_StringUtils.cpp   |   23 -
 paddle/legacy/utils/tests/test_Thread.cpp     |   81 -
 .../legacy/utils/tests/test_ThreadBarrier.cpp |   66 -
 797 files changed, 151956 deletions(-)
 delete mode 100644 paddle/legacy/api/Arguments.cpp
 delete mode 100644 paddle/legacy/api/CMakeLists.txt
 delete mode 100644 paddle/legacy/api/ConfigParser.cpp
 delete mode 100644 paddle/legacy/api/Evaluator.cpp
 delete mode 100644 paddle/legacy/api/GradientMachine.cpp
 delete mode 100644 paddle/legacy/api/Internal.h
 delete mode 100644 paddle/legacy/api/Matrix.cpp
 delete mode 100644 paddle/legacy/api/Paddle.i
 delete mode 100644 paddle/legacy/api/PaddleAPI.h
 delete mode 100644 paddle/legacy/api/PaddleAPIPrivate.h
 delete mode 100644 paddle/legacy/api/Parameter.cpp
 delete mode 100644 paddle/legacy/api/ParameterOptimizer.cpp
 delete mode 100644 paddle/legacy/api/ParameterUpdater.cpp
 delete mode 100644 paddle/legacy/api/SequenceGenerator.cpp
 delete mode 100644 paddle/legacy/api/Trainer.cpp
 delete mode 100644 paddle/legacy/api/Util.cpp
 delete mode 100644 paddle/legacy/api/Vector.cpp
 delete mode 100644 paddle/legacy/api/__init__.py
 delete mode 100644 paddle/legacy/api/numpy.i
 delete mode 100644 paddle/legacy/api/test/.gitignore
 delete mode 100644 paddle/legacy/api/test/CMakeLists.txt
 delete mode 100644 paddle/legacy/api/test/testArguments.py
 delete mode 100644 paddle/legacy/api/test/testGradientMachine.py
 delete mode 100644 paddle/legacy/api/test/testMatrix.py
 delete mode 100644 paddle/legacy/api/test/testTrain.py
 delete mode 100644 paddle/legacy/api/test/testTrainConfig.py
 delete mode 100644 paddle/legacy/api/test/testTrainer.py
 delete mode 100644 paddle/legacy/api/test/testVector.py
 delete mode 100644 paddle/legacy/api/test/util.py
 delete mode 100644 paddle/legacy/capi/Arguments.cpp
 delete mode 100644 paddle/legacy/capi/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/Main.cpp
 delete mode 100644 paddle/legacy/capi/Matrix.cpp
 delete mode 100644 paddle/legacy/capi/Vector.cpp
 delete mode 100644 paddle/legacy/capi/arguments.h
 delete mode 100644 paddle/legacy/capi/capi.h
 delete mode 100644 paddle/legacy/capi/capi_private.h
 delete mode 100644 paddle/legacy/capi/config.h.in
 delete mode 100644 paddle/legacy/capi/error.cpp
 delete mode 100644 paddle/legacy/capi/error.h
 delete mode 100644 paddle/legacy/capi/examples/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/README.md
 delete mode 100644 paddle/legacy/capi/examples/model_inference/README.md
 delete mode 100644 paddle/legacy/capi/examples/model_inference/common/common.h
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
 delete mode 100755 paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
 delete mode 100755 paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/main.c
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
 delete mode 100644 paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
 delete mode 100755 paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
 delete mode 100644 paddle/legacy/capi/gradient_machine.cpp
 delete mode 100644 paddle/legacy/capi/gradient_machine.h
 delete mode 100644 paddle/legacy/capi/main.h
 delete mode 100644 paddle/legacy/capi/matrix.h
 delete mode 100644 paddle/legacy/capi/paddle_capi.map
 delete mode 100644 paddle/legacy/capi/tests/.gitignore
 delete mode 100644 paddle/legacy/capi/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/capi/tests/test_Arguments.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_GradientMachine.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_Matrix.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_Vector.cpp
 delete mode 100644 paddle/legacy/capi/tests/test_predict_network.py
 delete mode 100644 paddle/legacy/capi/vector.h
 delete mode 100755 paddle/legacy/cuda/CMakeLists.txt
 delete mode 100644 paddle/legacy/cuda/include/hl_activation_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_aggregate.h
 delete mode 100644 paddle/legacy/cuda/include/hl_avx_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_base.h
 delete mode 100644 paddle/legacy/cuda/include/hl_batch_norm.h
 delete mode 100644 paddle/legacy/cuda/include/hl_batch_transpose.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cnn.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_gru.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_lstm.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_scalar.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cublas.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cudnn.h
 delete mode 100644 paddle/legacy/cuda/include/hl_cuda_cudnn.ph
 delete mode 100755 paddle/legacy/cuda/include/hl_device_functions.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_functions.h
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu.h
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_functions.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_gru.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_lstm.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_gru_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_lstm.h
 delete mode 100644 paddle/legacy/cuda/include/hl_lstm_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix.h
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_apply.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_base.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_ops.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_matrix_type.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_perturbation_util.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_recurrent_apply.cuh
 delete mode 100644 paddle/legacy/cuda/include/hl_sequence.h
 delete mode 100644 paddle/legacy/cuda/include/hl_sparse.h
 delete mode 100644 paddle/legacy/cuda/include/hl_sparse.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_table_apply.h
 delete mode 100644 paddle/legacy/cuda/include/hl_tensor_ops.h
 delete mode 100644 paddle/legacy/cuda/include/hl_thread.ph
 delete mode 100644 paddle/legacy/cuda/include/hl_time.h
 delete mode 100644 paddle/legacy/cuda/include/hl_top_k.h
 delete mode 100644 paddle/legacy/cuda/include/hl_warpctc_wrap.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cnn_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_cuda_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_lstm_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_matrix_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_sequence_stub.h
 delete mode 100644 paddle/legacy/cuda/include/stub/hl_sparse_stub.h
 delete mode 100644 paddle/legacy/cuda/src/avx_mathfun.h
 delete mode 100644 paddle/legacy/cuda/src/hl_avx_functions.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_batch_norm.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_batch_transpose.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cpu_functions.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_aggregate.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cnn.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cublas.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_cudnn.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_device.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_lstm.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_matrix.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sequence.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sparse.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_cuda_sparse.cuh
 delete mode 100644 paddle/legacy/cuda/src/hl_math.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_perturbation_util.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_table_apply.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_time.cc
 delete mode 100644 paddle/legacy/cuda/src/hl_top_k.cu
 delete mode 100644 paddle/legacy/cuda/src/hl_warpctc_wrap.cc
 delete mode 100644 paddle/legacy/function/BlockExpandOp.cpp
 delete mode 100644 paddle/legacy/function/BlockExpandOpTest.cpp
 delete mode 100644 paddle/legacy/function/BufferArg.cpp
 delete mode 100644 paddle/legacy/function/BufferArg.h
 delete mode 100644 paddle/legacy/function/BufferArgTest.cpp
 delete mode 100644 paddle/legacy/function/CMakeLists.txt
 delete mode 100644 paddle/legacy/function/ContextProjectionOp.cpp
 delete mode 100644 paddle/legacy/function/ContextProjectionOp.h
 delete mode 100644 paddle/legacy/function/ContextProjectionOpGpu.cu
 delete mode 100644 paddle/legacy/function/ContextProjectionOpTest.cpp
 delete mode 100644 paddle/legacy/function/ConvOp.h
 delete mode 100644 paddle/legacy/function/ConvOpTest.h
 delete mode 100644 paddle/legacy/function/CosSimOp.cpp
 delete mode 100644 paddle/legacy/function/CosSimOp.h
 delete mode 100644 paddle/legacy/function/CosSimOpGpu.cu
 delete mode 100644 paddle/legacy/function/CosSimOpTest.cpp
 delete mode 100644 paddle/legacy/function/CropOp.cpp
 delete mode 100644 paddle/legacy/function/CropOp.h
 delete mode 100644 paddle/legacy/function/CropOpGpu.cu
 delete mode 100644 paddle/legacy/function/CropOpTest.cpp
 delete mode 100644 paddle/legacy/function/CrossMapNormalOp.cpp
 delete mode 100644 paddle/legacy/function/CrossMapNormalOp.h
 delete mode 100644 paddle/legacy/function/CrossMapNormalOpGpu.cu
 delete mode 100644 paddle/legacy/function/CrossMapNormalOpTest.cpp
 delete mode 100644 paddle/legacy/function/DepthwiseConvOp.cpp
 delete mode 100644 paddle/legacy/function/DepthwiseConvOp.h
 delete mode 100644 paddle/legacy/function/DepthwiseConvOpGpu.cu
 delete mode 100644 paddle/legacy/function/DepthwiseConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/EigenGemm.cpp
 delete mode 100644 paddle/legacy/function/EigenThreadDevice.h
 delete mode 100644 paddle/legacy/function/Function.cpp
 delete mode 100644 paddle/legacy/function/Function.h
 delete mode 100644 paddle/legacy/function/FunctionTest.cpp
 delete mode 100644 paddle/legacy/function/FunctionTest.h
 delete mode 100644 paddle/legacy/function/GemmConvOp.cpp
 delete mode 100644 paddle/legacy/function/GemmConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/GemmFunctor.cpp
 delete mode 100644 paddle/legacy/function/GemmFunctor.h
 delete mode 100644 paddle/legacy/function/GruFunctor.h
 delete mode 100644 paddle/legacy/function/Im2Col.h
 delete mode 100644 paddle/legacy/function/Im2ColOp.cpp
 delete mode 100644 paddle/legacy/function/Im2ColOpGpu.cu
 delete mode 100644 paddle/legacy/function/Im2ColTest.cpp
 delete mode 100644 paddle/legacy/function/MulOp.cpp
 delete mode 100644 paddle/legacy/function/MulOp.h
 delete mode 100644 paddle/legacy/function/MulOpGpu.cu
 delete mode 100644 paddle/legacy/function/MulOpTest.cpp
 delete mode 100644 paddle/legacy/function/NaiveConvOp.cpp
 delete mode 100644 paddle/legacy/function/PadOp.cpp
 delete mode 100644 paddle/legacy/function/PadOp.h
 delete mode 100644 paddle/legacy/function/PadOpGpu.cu
 delete mode 100644 paddle/legacy/function/PadOpTest.cpp
 delete mode 100644 paddle/legacy/function/RowConvOp.cpp
 delete mode 100644 paddle/legacy/function/RowConvOp.h
 delete mode 100644 paddle/legacy/function/RowConvOpGpu.cu
 delete mode 100644 paddle/legacy/function/RowConvOpTest.cpp
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOp.cpp
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOp.h
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOpGpu.cu
 delete mode 100644 paddle/legacy/function/ScaleSubRegionOpTest.cpp
 delete mode 100644 paddle/legacy/function/SwitchOp.cpp
 delete mode 100644 paddle/legacy/function/SwitchOp.h
 delete mode 100644 paddle/legacy/function/SwitchOpGpu.cu
 delete mode 100644 paddle/legacy/function/SwitchOpTest.cpp
 delete mode 100644 paddle/legacy/function/TensorShape.h
 delete mode 100644 paddle/legacy/function/TensorShapeTest.cpp
 delete mode 100644 paddle/legacy/function/TensorType.h
 delete mode 100644 paddle/legacy/function/TensorTypeTest.cpp
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConv.cpp
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConv.h
 delete mode 100644 paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
 delete mode 100644 paddle/legacy/function/neon/neon_util.h
 delete mode 100644 paddle/legacy/function/nnpack/NNPACKConvOp.cpp
 delete mode 100644 paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
 delete mode 100644 paddle/legacy/gserver/CMakeLists.txt
 delete mode 100644 paddle/legacy/gserver/activations/ActivationFunction.cpp
 delete mode 100644 paddle/legacy/gserver/activations/ActivationFunction.h
 delete mode 100644 paddle/legacy/gserver/activations/MKLDNNActivation.cpp
 delete mode 100644 paddle/legacy/gserver/activations/MKLDNNActivation.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/DataProviderGroup.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/MultiDataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/ProtoReader.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider.h
 delete mode 100644 paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/Evaluator.cpp
 delete mode 100644 paddle/legacy/gserver/evaluators/Evaluator.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachine.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/MultiNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
 delete mode 100644 paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
 delete mode 100644 paddle/legacy/gserver/layers/AddtoLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AddtoLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/AgentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AgentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/AverageLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/AverageLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BatchNormalizationLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BilinearInterpLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/BlockExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/BlockExpandLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CRFDecodingLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CRFLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CRFLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CTCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CTCLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ClipLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConcatenateLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ContextProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ContextProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/Conv3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Conv3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvBaseProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvShiftLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransOperator.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ConvTransProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CosSimLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CosSimLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CostLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CostLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CropLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CropLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/CudnnPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DataLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DataLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DataNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DataNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DeConv3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DeConv3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DetectionOutputLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/DetectionUtil.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DetectionUtil.h
 delete mode 100644 paddle/legacy/gserver/layers/DotMulOperator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DotMulProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/DotProdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ExpandLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FactorizationMachineLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullMatrixProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullMatrixProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/FullyConnectedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GatedRecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/GetOutputLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.cpp
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.cu
 delete mode 100644 paddle/legacy/gserver/layers/GruCompute.h
 delete mode 100644 paddle/legacy/gserver/layers/GruStepLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/IdentityProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/InterpolationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/L2DistanceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/L2DistanceLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Layer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Layer.h
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCRF.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCRF.h
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCTC.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LinearChainCTC.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.cu
 delete mode 100644 paddle/legacy/gserver/layers/LstmCompute.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/LstmLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/LstmStepLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MDLstmLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBase.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNFcLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MKLPackedWeight.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxIdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxOutLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxOutLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MixedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MixedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MultiBoxLossLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/MultinomialSampler.cpp
 delete mode 100644 paddle/legacy/gserver/layers/MultinomialSampler.h
 delete mode 100644 paddle/legacy/gserver/layers/MultiplexLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NCELayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/NormProjectionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/NormProjectionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Operator.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Operator.h
 delete mode 100644 paddle/legacy/gserver/layers/OuterProdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PadLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PadLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ParameterReluLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ParameterReluLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/Pool3DLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Pool3DLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PoolProjectionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/PowerLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PrintLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/PriorBox.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Projection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/Projection.h
 delete mode 100644 paddle/legacy/gserver/layers/ROIPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ROIPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ResizeLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RotateLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RotateLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RowConvLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/RowConvLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/RowL2NormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SamplingIdLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ScalingLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ScalingProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequencePoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequencePoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceToBatch.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SequenceToBatch.h
 delete mode 100644 paddle/legacy/gserver/layers/SliceProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SubSequenceLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/SwitchOrderLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TableProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TableProjection.h
 delete mode 100644 paddle/legacy/gserver/layers/TensorLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TensorLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TransLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/TransLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
 delete mode 100644 paddle/legacy/gserver/layers/UpsampleLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/UpsampleLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/ValidationLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/ValidationLayer.h
 delete mode 100644 paddle/legacy/gserver/layers/WarpCTCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/layers/WarpCTCLayer.h
 delete mode 100644 paddle/legacy/gserver/tests/.gitignore
 delete mode 100644 paddle/legacy/gserver/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/gserver/tests/LayerGradUtil.cpp
 delete mode 100644 paddle/legacy/gserver/tests/LayerGradUtil.h
 delete mode 100644 paddle/legacy/gserver/tests/MKLDNNTester.cpp
 delete mode 100644 paddle/legacy/gserver/tests/MKLDNNTester.h
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/dummy.list
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/train.list
 delete mode 100644 paddle/legacy/gserver/tests/Sequence/train.list.nest
 delete mode 100644 paddle/legacy/gserver/tests/__init__.py
 delete mode 100644 paddle/legacy/gserver/tests/concat_dotmul_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_dotmul_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_slice_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_slice_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_table_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/concat_table_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_c.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_cudnn.py
 delete mode 100644 paddle/legacy/gserver/tests/img_conv_exconv.py
 delete mode 100644 paddle/legacy/gserver/tests/img_pool_a.conf
 delete mode 100644 paddle/legacy/gserver/tests/img_pool_b.conf
 delete mode 100644 paddle/legacy/gserver/tests/mkldnn_branch_net.conf
 delete mode 100644 paddle/legacy/gserver/tests/mkldnn_simple_net.conf
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider.py
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
 delete mode 100644 paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
 delete mode 100644 paddle/legacy/gserver/tests/rnn_data_provider.py
 delete mode 100644 paddle/legacy/gserver/tests/sequenceGen.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_layer_group.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_lstm.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_recurrent.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_recurrent_group.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
 delete mode 100644 paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
 delete mode 100644 paddle/legacy/gserver/tests/test_ActivationGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_BatchNorm.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CompareSparse.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_ConvTrans.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_ConvUnify.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_DetectionOutput.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Evaluator.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Expand.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_LayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MKLDNN.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_NetworkCompare.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PriorBox.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_PyDataProvider2.py
 delete mode 100644 paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_Upsample.cpp
 delete mode 100644 paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
 delete mode 100644 paddle/legacy/math/Allocator.h
 delete mode 100644 paddle/legacy/math/BaseMatrix.cu
 delete mode 100644 paddle/legacy/math/BaseMatrix.h
 delete mode 100644 paddle/legacy/math/CMakeLists.txt
 delete mode 100644 paddle/legacy/math/CpuSparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/CpuSparseMatrix.h
 delete mode 100644 paddle/legacy/math/ExecViaCpu.h
 delete mode 100644 paddle/legacy/math/MKLDNNMatrix.cpp
 delete mode 100644 paddle/legacy/math/MKLDNNMatrix.h
 delete mode 100644 paddle/legacy/math/MathFunctions.cpp
 delete mode 100644 paddle/legacy/math/MathFunctions.h
 delete mode 100644 paddle/legacy/math/MathUtils.cpp
 delete mode 100644 paddle/legacy/math/MathUtils.h
 delete mode 100644 paddle/legacy/math/Matrix.cpp
 delete mode 100644 paddle/legacy/math/Matrix.h
 delete mode 100644 paddle/legacy/math/MatrixBitCode.cpp
 delete mode 100644 paddle/legacy/math/MemoryHandle.cpp
 delete mode 100644 paddle/legacy/math/MemoryHandle.h
 delete mode 100644 paddle/legacy/math/NEONFunctions.cpp
 delete mode 100644 paddle/legacy/math/NEONFunctions.h
 delete mode 100644 paddle/legacy/math/PoolAllocator.cpp
 delete mode 100644 paddle/legacy/math/PoolAllocator.h
 delete mode 100644 paddle/legacy/math/RowBuffer.h
 delete mode 100644 paddle/legacy/math/SIMDFunctions.cpp
 delete mode 100644 paddle/legacy/math/SIMDFunctions.h
 delete mode 100644 paddle/legacy/math/SparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/SparseMatrix.h
 delete mode 100644 paddle/legacy/math/SparseRowMatrix.cpp
 delete mode 100644 paddle/legacy/math/SparseRowMatrix.h
 delete mode 100644 paddle/legacy/math/Storage.cpp
 delete mode 100644 paddle/legacy/math/Storage.h
 delete mode 100644 paddle/legacy/math/TensorApply.h
 delete mode 100644 paddle/legacy/math/TensorAssign.h
 delete mode 100644 paddle/legacy/math/TensorEvaluate.h
 delete mode 100644 paddle/legacy/math/TensorExpression.h
 delete mode 100644 paddle/legacy/math/TrainingAlgorithmOp.cu
 delete mode 100644 paddle/legacy/math/TrainingAlgorithmOp.h
 delete mode 100644 paddle/legacy/math/Vector.cpp
 delete mode 100644 paddle/legacy/math/Vector.h
 delete mode 100644 paddle/legacy/math/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/math/tests/OriginalOptimizerApi.h
 delete mode 100644 paddle/legacy/math/tests/PerfUtils.h
 delete mode 100644 paddle/legacy/math/tests/TensorCheck.h
 delete mode 100644 paddle/legacy/math/tests/TestUtils.h
 delete mode 100644 paddle/legacy/math/tests/test_Allocator.cpp
 delete mode 100644 paddle/legacy/math/tests/test_BaseMatrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_CpuGpuVector.cpp
 delete mode 100644 paddle/legacy/math/tests/test_ExecViaCpu.cpp
 delete mode 100644 paddle/legacy/math/tests/test_FPException.cpp
 delete mode 100644 paddle/legacy/math/tests/test_GpuProfiler.cpp
 delete mode 100644 paddle/legacy/math/tests/test_Matrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_RowBuffer.cpp
 delete mode 100644 paddle/legacy/math/tests/test_SIMDFunctions.cpp
 delete mode 100644 paddle/legacy/math/tests/test_SparseMatrix.cpp
 delete mode 100644 paddle/legacy/math/tests/test_Tensor.cu
 delete mode 100644 paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
 delete mode 100644 paddle/legacy/math/tests/test_batchTranspose.cpp
 delete mode 100644 paddle/legacy/math/tests/test_lazyAssign.cu
 delete mode 100644 paddle/legacy/math/tests/test_matrixCompare.cpp
 delete mode 100644 paddle/legacy/math/tests/test_matrixUtil.h
 delete mode 100644 paddle/legacy/math/tests/test_perturbation.cpp
 delete mode 100644 paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
 delete mode 100644 paddle/legacy/optimizer/CMakeLists.txt
 delete mode 100644 paddle/legacy/optimizer/adadelta_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adadelta_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/adagrad_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adagrad_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/adam_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/adam_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/lr_policy.h
 delete mode 100644 paddle/legacy/optimizer/optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/optimizer.h
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/parameter_optimizer_test.cc
 delete mode 100644 paddle/legacy/optimizer/serialization.h
 delete mode 100644 paddle/legacy/optimizer/serialization_test.cc
 delete mode 100644 paddle/legacy/optimizer/sgd_optimizer.cc
 delete mode 100644 paddle/legacy/optimizer/sgd_optimizer.h
 delete mode 100644 paddle/legacy/optimizer/tensor.h
 delete mode 100644 paddle/legacy/parameter/Argument.cpp
 delete mode 100644 paddle/legacy/parameter/Argument.h
 delete mode 100644 paddle/legacy/parameter/AverageOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/AverageOptimizer.h
 delete mode 100644 paddle/legacy/parameter/CMakeLists.txt
 delete mode 100644 paddle/legacy/parameter/FirstOrderOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/FirstOrderOptimizer.h
 delete mode 100644 paddle/legacy/parameter/LearningRateScheduler.cpp
 delete mode 100644 paddle/legacy/parameter/LearningRateScheduler.h
 delete mode 100644 paddle/legacy/parameter/OptimizerFunctions.cpp
 delete mode 100644 paddle/legacy/parameter/OptimizerFunctions.h
 delete mode 100644 paddle/legacy/parameter/OptimizerWithRegularizer.cpp
 delete mode 100644 paddle/legacy/parameter/OptimizerWithRegularizer.h
 delete mode 100644 paddle/legacy/parameter/Parameter.cpp
 delete mode 100644 paddle/legacy/parameter/Parameter.h
 delete mode 100644 paddle/legacy/parameter/ParameterOptimizer.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterOptimizer.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdateFunctions.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdateFunctions.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterBase.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterBase.h
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterHook.cpp
 delete mode 100644 paddle/legacy/parameter/ParameterUpdaterHook.h
 delete mode 100644 paddle/legacy/parameter/Regularizer.cpp
 delete mode 100644 paddle/legacy/parameter/Regularizer.h
 delete mode 100644 paddle/legacy/parameter/ThreadLocalBuffer.cpp
 delete mode 100644 paddle/legacy/parameter/ThreadLocalBuffer.h
 delete mode 100644 paddle/legacy/parameter/Weight.cpp
 delete mode 100644 paddle/legacy/parameter/Weight.h
 delete mode 100644 paddle/legacy/parameter/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/parameter/tests/test_argument.cpp
 delete mode 100644 paddle/legacy/parameter/tests/test_common.cpp
 delete mode 100644 paddle/legacy/pserver/BaseClient.cpp
 delete mode 100644 paddle/legacy/pserver/BaseClient.h
 delete mode 100644 paddle/legacy/pserver/CMakeLists.txt
 delete mode 100644 paddle/legacy/pserver/LightNetwork.cpp
 delete mode 100644 paddle/legacy/pserver/LightNetwork.h
 delete mode 100644 paddle/legacy/pserver/ParameterClient2.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterClient2.h
 delete mode 100644 paddle/legacy/pserver/ParameterServer2.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServer2.h
 delete mode 100644 paddle/legacy/pserver/ParameterServer2Main.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServerController.cpp
 delete mode 100644 paddle/legacy/pserver/ParameterServerController.h
 delete mode 100644 paddle/legacy/pserver/ProtoServer.cpp
 delete mode 100644 paddle/legacy/pserver/ProtoServer.h
 delete mode 100644 paddle/legacy/pserver/RDMANetwork.h
 delete mode 100644 paddle/legacy/pserver/SocketChannel.cpp
 delete mode 100644 paddle/legacy/pserver/SocketChannel.h
 delete mode 100644 paddle/legacy/pserver/SparseParameterDistribution.cpp
 delete mode 100644 paddle/legacy/pserver/SparseParameterDistribution.h
 delete mode 100644 paddle/legacy/pserver/test/.gitignore
 delete mode 100644 paddle/legacy/pserver/test/CMakeLists.txt
 delete mode 100644 paddle/legacy/pserver/test/SocketTest.cpp
 delete mode 100644 paddle/legacy/pserver/test/test_ParameterServer2.cpp
 delete mode 100644 paddle/legacy/pserver/test/test_ProtoServer.cpp
 delete mode 100755 paddle/legacy/pserver/test/test_ProtoServer.sh
 delete mode 100644 paddle/legacy/trainer/CMakeLists.txt
 delete mode 100644 paddle/legacy/trainer/MergeModel.cpp
 delete mode 100644 paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/NewRemoteParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/ParamUtil.cpp
 delete mode 100644 paddle/legacy/trainer/ParamUtil.h
 delete mode 100644 paddle/legacy/trainer/ParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/ParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/RemoteParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/RemoteParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/Tester.cpp
 delete mode 100644 paddle/legacy/trainer/Tester.h
 delete mode 100644 paddle/legacy/trainer/TesterConfig.h
 delete mode 100644 paddle/legacy/trainer/ThreadParameterUpdater.cpp
 delete mode 100644 paddle/legacy/trainer/ThreadParameterUpdater.h
 delete mode 100644 paddle/legacy/trainer/Trainer.cpp
 delete mode 100644 paddle/legacy/trainer/Trainer.h
 delete mode 100644 paddle/legacy/trainer/TrainerBenchmark.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerConfigHelper.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerConfigHelper.h
 delete mode 100644 paddle/legacy/trainer/TrainerInternal.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerInternal.h
 delete mode 100644 paddle/legacy/trainer/TrainerInternalConfig.cpp
 delete mode 100644 paddle/legacy/trainer/TrainerInternalConfig.h
 delete mode 100644 paddle/legacy/trainer/TrainerMain.cpp
 delete mode 100644 paddle/legacy/trainer/tests/.gitignore
 delete mode 100644 paddle/legacy/trainer/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/trainer/tests/__init__.py
 delete mode 100644 paddle/legacy/trainer/tests/config_parser_test.py
 delete mode 100644 paddle/legacy/trainer/tests/fake_file_list.list
 delete mode 100644 paddle/legacy/trainer/tests/picojson.h
 delete mode 100644 paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
 delete mode 100644 paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
 delete mode 100644 paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
 delete mode 100644 paddle/legacy/trainer/tests/sample_data.txt
 delete mode 100644 paddle/legacy/trainer/tests/sample_filelist.txt
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
 delete mode 100644 paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
 delete mode 100644 paddle/legacy/trainer/tests/simple_sparse_neural_network.py
 delete mode 100644 paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
 delete mode 100644 paddle/legacy/trainer/tests/testPyDataWrapper.py
 delete mode 100644 paddle/legacy/trainer/tests/test_Compare.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_Trainer.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
 delete mode 100644 paddle/legacy/trainer/tests/test_config.conf
 delete mode 100644 paddle/legacy/trainer/tests/test_gen_dict.txt
 delete mode 100644 paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
 delete mode 100644 paddle/legacy/utils/.gitignore
 delete mode 100644 paddle/legacy/utils/Any.h
 delete mode 100644 paddle/legacy/utils/CMakeLists.txt
 delete mode 100644 paddle/legacy/utils/ClassRegistrar.h
 delete mode 100644 paddle/legacy/utils/Common.h
 delete mode 100644 paddle/legacy/utils/CpuId.cpp
 delete mode 100644 paddle/legacy/utils/CpuId.h
 delete mode 100644 paddle/legacy/utils/CustomStackTrace.cpp
 delete mode 100644 paddle/legacy/utils/CustomStackTrace.h
 delete mode 100644 paddle/legacy/utils/DynamicLoader.cpp
 delete mode 100644 paddle/legacy/utils/DynamicLoader.h
 delete mode 100644 paddle/legacy/utils/Error.h
 delete mode 100644 paddle/legacy/utils/Excepts.h
 delete mode 100644 paddle/legacy/utils/Flags.cpp
 delete mode 100644 paddle/legacy/utils/Flags.h
 delete mode 100644 paddle/legacy/utils/GlobalConstants.cpp
 delete mode 100644 paddle/legacy/utils/GlobalConstants.h
 delete mode 100644 paddle/legacy/utils/Locks.h
 delete mode 100644 paddle/legacy/utils/Logging.cpp
 delete mode 100644 paddle/legacy/utils/Logging.h
 delete mode 100644 paddle/legacy/utils/PythonUtil.cpp
 delete mode 100644 paddle/legacy/utils/PythonUtil.h
 delete mode 100644 paddle/legacy/utils/Queue.h
 delete mode 100644 paddle/legacy/utils/Stat.cpp
 delete mode 100644 paddle/legacy/utils/Stat.h
 delete mode 100644 paddle/legacy/utils/StringUtil.cpp
 delete mode 100644 paddle/legacy/utils/StringUtil.h
 delete mode 100644 paddle/legacy/utils/Thread.h
 delete mode 100644 paddle/legacy/utils/ThreadLocal.cpp
 delete mode 100644 paddle/legacy/utils/ThreadLocal.h
 delete mode 100644 paddle/legacy/utils/Util.cpp
 delete mode 100644 paddle/legacy/utils/Util.h
 delete mode 100644 paddle/legacy/utils/Version.cpp
 delete mode 100644 paddle/legacy/utils/Version.h
 delete mode 100644 paddle/legacy/utils/arch/linux/Locks.cpp
 delete mode 100644 paddle/legacy/utils/arch/osx/Excepts.cpp
 delete mode 100644 paddle/legacy/utils/arch/osx/Locks.cpp
 delete mode 100644 paddle/legacy/utils/enable_virtualenv.py
 delete mode 100644 paddle/legacy/utils/tests/CMakeLists.txt
 delete mode 100644 paddle/legacy/utils/tests/test_CustomStackTrace.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
 delete mode 100755 paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
 delete mode 100644 paddle/legacy/utils/tests/test_Error.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_SIMDFlags.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_SpinLock.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_StringUtils.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_Thread.cpp
 delete mode 100644 paddle/legacy/utils/tests/test_ThreadBarrier.cpp

diff --git a/paddle/legacy/api/Arguments.cpp b/paddle/legacy/api/Arguments.cpp
deleted file mode 100644
index 7bb5a6f75..000000000
--- a/paddle/legacy/api/Arguments.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "paddle/legacy/parameter/Argument.h"
-
-size_t Arguments::getSlotNum() const { return m->outputs.size(); }
-
-Arguments* Arguments::createArguments(size_t slotNum) {
-  auto args = new Arguments();
-  args->m->outputs.resize(slotNum);
-  return args;
-}
-
-void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
-
-Arguments::Arguments() : m(new ArgumentsPrivate()) {}
-
-Arguments::~Arguments() { delete m; }
-
-Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
-  auto p = (std::vector<paddle::Argument>*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs = *p;
-  return args;
-}
-
-Arguments* Arguments::createByPaddleArgument(const void* ptr) {
-  auto p = (paddle::Argument*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs.push_back(*p);
-  return args;
-}
-
-Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.value);
-}
-
-Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.grad);
-}
-
-IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.ids);
-}
-
-Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.in);
-}
-
-void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.ids = v;
-}
-
-template <typename T1>
-static inline void doCopyFromSafely(std::shared_ptr<T1>& dest,
-                                    std::shared_ptr<T1>& src) {
-  if (src) {
-    if (dest) {
-      dest->copyFrom(*src);
-    } else {
-      dest = src;
-    }
-  }
-}
-
-IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.sequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.sequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.subSequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.subSequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-void Arguments::setSlotSequenceStartPositions(size_t idx,
-                                              IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.sequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-void Arguments::setSlotSubSequenceStartPositions(
-    size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims);
-}
-
-void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
-}
-
-float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
-
-int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getBatchSize();
-}
-
-void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameHeight(h);
-}
-
-void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameWidth(w);
-}
-
-size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameHeight();
-}
-
-size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameWidth();
-}
-
-void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; }
diff --git a/paddle/legacy/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt
deleted file mode 100644
index 06e1f5d5f..000000000
--- a/paddle/legacy/api/CMakeLists.txt
+++ /dev/null
@@ -1,120 +0,0 @@
-set(API_SOURCES
-    Arguments.cpp
-    ConfigParser.cpp
-    Evaluator.cpp
-    GradientMachine.cpp
-    Matrix.cpp
-    Parameter.cpp
-    ParameterOptimizer.cpp
-    ParameterUpdater.cpp
-    SequenceGenerator.cpp
-    Trainer.cpp
-    Util.cpp
-    Vector.cpp)
-set(API_HEADER
-    PaddleAPI.h
-    Internal.h)
-
-add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
-
-INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
-
-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
-
-SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
-
-SET(SWIG_NEED_FLAGS
-    -ftls-model=global-dynamic
-    -Wno-parentheses-equality
-    -Wno-self-assign
-    -Wno-maybe-uninitialized
-    -Wno-missing-field-initializers)
-  FOREACH(flag ${SWIG_NEED_FLAGS})
-  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
-ENDFOREACH()
-
-SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
-
-SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
-    paddle_parameter
-    paddle_function
-    paddle_math
-    paddle_utils
-    paddle_gserver
-    paddle_pserver
-    paddle_api
-    paddle_cuda
-    paddle_trainer_lib
-    paddle_network
-    paddle_proto
-    ${external_project_dependencies}
-    ${RDMA_LIBS}
-)
-
-IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-ELSE(APPLE)
-    SET(START_GROUP "-Xlinker -start-group")
-    SET(END_GROUP "-Xlinker -end-group")
-    SET(ARCHIVE_START "-Wl,--whole-archive")
-    SET(ARCHIVE_END "-Wl,--no-whole-archive")
-ENDIF(APPLE)
-
-SWIG_ADD_MODULE(swig_paddle python Paddle.i)
-SWIG_LINK_LIBRARIES(swig_paddle
-    ${MACOS_LD_FLAGS}
-    ${START_GROUP}
-    ${ARCHIVE_START}
-    paddle_gserver
-    paddle_function
-    ${METRIC_LIBS}
-    ${ARCHIVE_END}
-    paddle_pserver
-    paddle_trainer_lib
-    paddle_network
-    paddle_parameter
-    paddle_optimizer
-    paddle_math
-    paddle_utils
-    paddle_proto
-    paddle_cuda
-    paddle_api
-    ${CMAKE_DL_LIBS}
-    ${EXTERNAL_LIBS}
-    ${CMAKE_THREAD_LIBS_INIT}
-    ${RDMA_LD_FLAGS}
-    ${START_END}
-)
-
-add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-    DEPENDS _swig_paddle
-)
-
-# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
-
-if(WITH_TESTING)
-    IF(NOT PY_PIP_FOUND)
-        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
-        ExternalProject_Add(pip
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY      https://github.com/pypa/pip.git
-            GIT_TAG             9.0.1
-            PREFIX              ${PIP_SOURCES_DIR}
-            CONFIGURE_COMMAND   ""
-            BUILD_COMMAND       ""
-            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-            BUILD_IN_SOURCE     1
-            #DEPENDS python setuptools python_api_wheel
-        )
-    ENDIF()
-    add_subdirectory(test)
-endif()
diff --git a/paddle/legacy/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp
deleted file mode 100644
index 016d6da4e..000000000
--- a/paddle/legacy/api/ConfigParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-#include "paddle/legacy/trainer/Trainer.h"
-
-struct ParameterConfigPrivate {
-  paddle::ParameterPtr parameter;
-  paddle::ParameterConfig config;
-
-  inline paddle::ParameterConfig* getConfigPtr() {
-    if (parameter != nullptr) {
-      auto& conf = parameter->getConfig();
-      return const_cast<paddle::ParameterConfig*>(&conf);
-    } else {
-      return &config;
-    }
-  }
-};
-
-TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
-
-TrainerConfig::~TrainerConfig() { delete m; }
-
-TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
-    const std::string& confPath) {
-  LOG(INFO) << "load trainer config from " << confPath;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
-  auto retv = new TrainerConfig();
-  retv->m->conf = conf;
-  return retv;
-}
-
-TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
-  auto retv = new TrainerConfig();
-  paddle::TrainerConfig trainerConfigProto;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
-  CHECK(conf->getMutableConfig().ParseFromString(str));
-  retv->m->conf = conf;
-  return retv;
-}
-
-ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {}
-
-ModelConfig::~ModelConfig() { delete m; }
-
-ModelConfig* TrainerConfig::getModelConfig() const {
-  auto retv = new ModelConfig();
-  retv->m->conf = m->conf;
-  return retv;
-}
-
-ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
-
-ParameterConfig::~ParameterConfig() { delete m; }
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
-    void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p != nullptr) {
-    auto conf = new ParameterConfig();
-    conf->m->parameter = p;
-    return conf;
-  } else {
-    return nullptr;
-  }
-}
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterPtr(
-    void* ptr) {
-  auto& p = *(paddle::Parameter*)(ptr);
-  auto conf = new ParameterConfig();
-  conf->m->config = p.getConfig();
-  return conf;
-}
-
-std::string ParameterConfig::toProtoString() const {
-  return m->getConfigPtr()->SerializeAsString();
-}
-
-void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
-
-OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
-
-OptimizationConfig::~OptimizationConfig() { delete m; }
-
-std::string OptimizationConfig::toProtoString() {
-  return m->getConfig().SerializeAsString();
-}
-
-OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
-  auto opt_config = new OptimizationConfig();
-  opt_config->m->trainer_config = m->conf;
-  return opt_config;
-}
-
-OptimizationConfig* OptimizationConfig::createFromProtoString(
-    const std::string& str) {
-  auto conf = new OptimizationConfig();
-  conf->m->config.ParseFromString(str);
-  return conf;
-}
diff --git a/paddle/legacy/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp
deleted file mode 100644
index c4aac47cb..000000000
--- a/paddle/legacy/api/Evaluator.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <sstream>
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-Evaluator::Evaluator() : m(new EvaluatorPrivate()) {}
-Evaluator::~Evaluator() { delete m; }
-
-void Evaluator::start() { m->rawPtr->start(); }
-
-void Evaluator::finish() { m->rawPtr->finish(); }
-
-std::string Evaluator::toString() {
-  std::ostringstream sout;
-  m->rawPtr->printStats(sout);
-  return sout.str();
-}
-
-std::vector<std::string> Evaluator::getNames() const {
-  std::vector<std::string> retv;
-  m->rawPtr->getNames(&retv);
-  return retv;
-}
-
-double Evaluator::getValue(const std::string name) const {
-  paddle::Error err;
-  double v = m->rawPtr->getValue(name, &err);
-  if (!err.isOK()) {
-    throw std::runtime_error(err.msg());
-  }
-  return v;
-}
diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp
deleted file mode 100644
index 5ad2fe11a..000000000
--- a/paddle/legacy/api/GradientMachine.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "Internal.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-
-std::vector<int> GradientMachine::defaultParamTypes = {
-    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
-
-GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
-
-GradientMachine::~GradientMachine() { delete m; }
-
-GradientMachine* GradientMachine::createFromPaddleModelPtr(
-    const void* confPtr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto& conf = *(const paddle::ModelConfig*)(confPtr);
-  std::vector<ParameterType> realTypes;
-  staticCastVector(&realTypes, types);
-  auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
-  auto machinePtr = std::shared_ptr<paddle::GradientMachine>(machineRawPtr);
-  if (machinePtr != nullptr) {
-    auto machine = new GradientMachine();
-    machine->m->machine = machinePtr;
-    return machine;
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByConfigProtoStr(
-    const std::string& protoStr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  paddle::ModelConfig conf;
-  conf.ParseFromString(protoStr);
-  if (conf.IsInitialized()) {
-    return GradientMachine::createFromPaddleModelPtr(&conf, mode, types);
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByModelConfig(
-    ModelConfig* conf,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto confPtr = &conf->m->conf->getModelConfig();
-  return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
-}
-
-void GradientMachine::start() { m->machine->start(); }
-
-void GradientMachine::finish() { m->machine->finish(); }
-
-void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
-
-void GradientMachine::prefetch(const Arguments& inArgs) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  m->machine->prefetch(in);
-}
-
-void GradientMachine::forward(const Arguments& inArgs,
-                              Arguments* outArgs,
-                              PassType passType) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forward(in, &out, pt);
-}
-
-UpdateCallback::~UpdateCallback() {}
-
-void UpdateCallback::apply(Parameter* p) {
-  // UNUSED(p);
-}
-
-class UpdateCallbackWrapper {
- public:
-  explicit UpdateCallbackWrapper(const UpdateCallback& callback)
-      : callback(const_cast<UpdateCallback&>(callback)) {}
-
-  void operator()(paddle::Parameter* param) {
-    auto p = Parameter::createFromRawPtr(&param);
-    // @TODO Use Stack variable instead.
-    callback.apply(p);
-    delete p;
-  }
-
- private:
-  UpdateCallback& callback;
-};
-
-void GradientMachine::backward(const UpdateCallback& callback) {
-  m->machine->backward(UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::forwardBackward(const Arguments& inArgs,
-                                      Arguments* outArgs,
-                                      PassType passType,
-                                      const UpdateCallback& callback) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::loadParameters(const std::string& path) {
-  m->machine->loadParameters(path);
-}
-
-size_t GradientMachine::getParameterSize() const {
-  return m->machine->getParameters().size();
-}
-
-Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-size_t GradientMachine::getNonStaticParameterSize() const {
-  return m->machine->getNonStaticParameters().size();
-}
-
-Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getNonStaticParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(
-        &m->machine->getNonStaticParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-void GradientMachine::randParameters() { m->machine->randParameters(); }
-
-Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
-    throw(UnsupportError) {
-  auto nn = m->machine;
-  if (nn) {
-    auto arg = nn->getLayerOutput(layerName);
-    return Arguments::createByPaddleArgument(&arg);
-  } else {
-    throw UnsupportError();
-  }
-}
-
-SequenceGenerator* GradientMachine::asSequenceGenerator(
-    const std::vector<std::string>& dict,
-    size_t begin_id,
-    size_t end_id,
-    size_t max_length,
-    size_t beam_size) {
-  SequenceGenerator* r =
-      SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
-  r->setDict(dict);
-  r->setBos(begin_id);
-  r->setEos(end_id);
-  r->setMaxLength(max_length);
-  r->setBeamSize(beam_size);
-  return r;
-}
-
-Evaluator* GradientMachine::makeEvaluator() {
-  auto ev = new Evaluator();
-  ev->m->rawPtr = m->machine->makeEvaluator();
-  return ev;
-}
-
-void GradientMachine::eval(Evaluator* evaluator) {
-  m->machine->eval(evaluator->m->rawPtr);
-}
diff --git a/paddle/legacy/api/Internal.h b/paddle/legacy/api/Internal.h
deleted file mode 100644
index 2195cc673..000000000
--- a/paddle/legacy/api/Internal.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "PaddleAPI.h"
-
-#include <algorithm>
-#include <vector>
-
-template <typename T1, typename T2>
-void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
-  dest->resize(src.size());
-  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
-    return static_cast<T2>(t);
-  });
-}
diff --git a/paddle/legacy/api/Matrix.cpp b/paddle/legacy/api/Matrix.cpp
deleted file mode 100644
index 8862d0ea9..000000000
--- a/paddle/legacy/api/Matrix.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/Matrix.h"
-#include <cstring>
-#include <iostream>
-#include "PaddleAPI.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-struct MatrixPrivate {
-  std::shared_ptr<paddle::Matrix> mat;
-};
-
-Matrix::Matrix() : m(new MatrixPrivate()) {}
-
-Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) {
-  auto* mat = reinterpret_cast<paddle::MatrixPtr*>(sharedPtr);
-  if ((*mat) != nullptr) {
-    auto m = new Matrix();
-    m->m->mat = *mat;
-    return m;
-  } else {
-    return nullptr;
-  }
-}
-
-Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->zero();
-  return m;
-}
-
-Matrix* Matrix::createDense(const std::vector<float>& data,
-                            size_t height,
-                            size_t width,
-                            bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->copyFrom(data.data(), data.size());
-  return m;
-}
-
-Matrix* Matrix::createDenseFromNumpy(float* data,
-                                     int dim1,
-                                     int dim2,
-                                     bool copy,
-                                     bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// Gpu mode only supports copy=True
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
-  } else {
-    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
-  }
-}
-
-Matrix* Matrix::createCpuDenseFromNumpy(float* data,
-                                        int dim1,
-                                        int dim2,
-                                        bool copy) {
-  auto m = new Matrix();
-  if (copy) {
-    m->m->mat = paddle::Matrix::create(dim1, dim2);
-    m->m->mat->copyFrom(data, dim1 * dim2);
-  } else {
-    m->m->mat = paddle::Matrix::create(data, dim1, dim2, false);
-  }
-  return m;
-}
-
-Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(dim1, dim2, false, true);
-  m->m->mat->copyFrom(data, dim1 * dim2);
-  return m;
-}
-
-Matrix* Matrix::createSparse(size_t height,
-                             size_t width,
-                             size_t nnz,
-                             bool isNonVal,
-                             bool isTrans,
-                             bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      isTrans,
-      useGpu);
-  return m;
-}
-
-Matrix::~Matrix() { delete m; }
-
-size_t Matrix::getHeight() const { return m->mat->getHeight(); }
-
-size_t Matrix::getWidth() const { return m->mat->getWidth(); }
-
-float Matrix::get(size_t x, size_t y) const throw(RangeError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  return m->mat->getElement(x, y);
-}
-
-void Matrix::set(size_t x, size_t y, float val) throw(RangeError,
-                                                      UnsupportError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  auto rawMat = m->mat.get();
-  if (auto cDenseMat = dynamic_cast<paddle::CpuMatrix*>(rawMat)) {
-    *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val;
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-bool Matrix::isSparse() const {
-  auto raw_mat = m->mat.get();
-  return dynamic_cast<paddle::CpuSparseMatrix*>(raw_mat) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(raw_mat) != nullptr;
-}
-
-SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseValueType)cpuSparseMat->getValueType();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return (SparseValueType)gpuSparseMat->getValueType();
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseFormatType)cpuSparseMat->getFormat();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return SPARSE_CSR;
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-IntArray Matrix::getSparseRowCols(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getFormat() == paddle::SPARSE_CSR) {
-    if (i < cpuSparseMat->getHeight()) {
-      // cpuSparseMat->print(std::cout);
-      size_t len = cpuSparseMat->getColNum(i);
-      return IntArray(cpuSparseMat->getRowCols(i), len);
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) {
-    if (i < cpuSparseMat->getHeight()) {
-      return IntWithFloatArray(cpuSparseMat->getRowValues(i),
-                               cpuSparseMat->getRowCols(i),
-                               cpuSparseMat->getColNum(i));
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-FloatArray Matrix::getData() const {
-  auto rawMat = m->mat.get();
-  if (dynamic_cast<paddle::GpuMemoryHandle*>(rawMat->getMemoryHandle().get())) {
-    // is gpu. then copy data
-    float* data = rawMat->getData();
-    size_t len = rawMat->getElementCnt();
-    float* cpuData = new float[len];
-    hl_memcpy_device2host(cpuData, data, len * sizeof(float));
-    FloatArray ret_val(cpuData, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt());
-    return ret_val;
-  }
-}
-
-void Matrix::sparseCopyFrom(
-    const std::vector<int>& rows,
-    const std::vector<int>& cols,
-    const std::vector<float>& vals) throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    // LOG(INFO) <<"RowSize = "<<rows.size()
-    //  <<" ColSize = "<<cols.size()
-    //  <<" ValSize = "<<vals.size();
-    cpuSparseMat->copyFrom(const_cast<std::vector<int>&>(rows),
-                           const_cast<std::vector<int>&>(cols),
-                           const_cast<std::vector<float>&>(vals));
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-void* Matrix::getSharedPtr() const { return &m->mat; }
-
-void Matrix::toNumpyMatInplace(float** view_data,
-                               int* dim1,
-                               int* dim2) throw(UnsupportError) {
-  auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
-  if (cpuMat) {
-    *dim1 = cpuMat->getHeight();
-    *dim2 = cpuMat->getWidth();
-    *view_data = cpuMat->getData();
-  } else {
-    throw UnsupportError();
-  }
-}
-void Matrix::copyToNumpyMat(float** view_m_data,
-                            int* dim1,
-                            int* dim2) throw(UnsupportError) {
-  static_assert(sizeof(paddle::real) == sizeof(float),
-                "Currently PaddleAPI only support for single "
-                "precision version of paddle.");
-  if (this->isSparse()) {
-    throw UnsupportError();
-  } else {
-    *dim1 = m->mat->getHeight();
-    *dim2 = m->mat->getWidth();
-    *view_m_data = new float[(*dim1) * (*dim2)];
-    if (auto cpuMat = dynamic_cast<paddle::CpuMatrix*>(m->mat.get())) {
-      auto src = cpuMat->getData();
-      auto dest = *view_m_data;
-      std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
-      auto src = gpuMat->getData();
-      auto dest = *view_m_data;
-      hl_memcpy_device2host(
-          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else {
-      LOG(WARNING) << "Unexpected Situation";
-      throw UnsupportError();
-    }
-  }
-}
-
-void Matrix::copyFromNumpyMat(float* data,
-                              int dim1,
-                              int dim2) throw(UnsupportError, RangeError) {
-  if (isSparse()) {
-    throw UnsupportError();
-  } else {
-    if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) {
-      if (m->mat->getData() != data) {
-        m->mat->copyFrom(data, dim1 * dim2);
-      }
-    } else {
-      throw RangeError();
-    }
-  }
-}
-
-bool Matrix::isGpu() const {
-  auto rawPtr = m->mat.get();
-  return dynamic_cast<paddle::GpuMatrix*>(rawPtr) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(rawPtr) != nullptr;
-}
diff --git a/paddle/legacy/api/Paddle.i b/paddle/legacy/api/Paddle.i
deleted file mode 100644
index 7a1456a5c..000000000
--- a/paddle/legacy/api/Paddle.i
+++ /dev/null
@@ -1,202 +0,0 @@
-%module(directors="1") swig_paddle
-%include "std_string.i"
-%{
-#define SWIG_FILE_WITH_INIT
-#include "legacy/api/PaddleAPI.h"
-%}
-
-%include "exception.i"
-%typemap(throws) UnsupportError %{
-  SWIG_exception(SWIG_RuntimeError, $1.what());
-  SWIG_fail;
-%}
-
-%include "std_vector.i"
-%include "std_pair.i"
-#ifdef SWIGPYTHON
-%include "numpy.i"
-#endif
-
-%init %{
-#ifdef SWIGPYTHON
-import_array();
-#endif
-%}
-
-
-namespace std {
-%template(vector_int) vector<int>;
-%template(vector_uint) vector<unsigned int>;
-%template(vector_float) vector<float>;
-%template(vector_string) vector<string>;
-%template(vector_vec_star) vector<Vector*>;
-}
-#ifdef SWIGPYTHON 
-%typemap(in) (int argc, char** argv) { 
-    int i = 0; 
-    if (!PyList_Check($input)) { 
-        PyErr_SetString(PyExc_ValueError, "Expecting a list"); 
-        return NULL; 
-    } 
-    $1 = PyList_Size($input); 
-    $2 = (char **) malloc(($1+1)*sizeof(char *)); 
-    for (i = 0; i < $1; i++) { 
-        PyObject *s = PyList_GetItem($input,i); 
-        if (!PyString_Check(s)) { 
-            free($2); 
-            PyErr_SetString(PyExc_ValueError, "List items must be strings"); 
-            return NULL; 
-        } 
-        $2[i] = PyString_AsString(s); 
-    } 
-    $2[i] = 0; 
-} 
-%typemap(freearg) (int argc, char** argv) { 
-    if ($2) free($2); 
-} 
-
-%typemap(out) FloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i]));
-  }  
-  if($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntArray {
-  $result = PyList_New($1.length);  
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyInt_FromLong($1.buf[i]));  
-  }
-  if ($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntWithFloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyTuple_Pack(2, 
-      PyInt_FromLong($1.idxBuf[i]),
-      PyFloat_FromDouble($1.valBuf[i])
-    ));
-  }
-  if ($1.needFree) {
-    delete [] $1.idxBuf;
-    delete [] $1.valBuf;
-  } 
-}
-
-
-%rename(__getitem__) IVector::get;
-%rename(__setitem__) IVector::set;
-%rename(__len__) IVector::getSize;
-%rename(__getitem__) Vector::get;
-%rename(__setitem__) Vector::set;
-%rename(__len__) Vector::getSize;
-%rename(__len__) Parameter::getSize;
-%rename(__call__) ParameterTraverseCallback::apply;
-%rename(__repr__) Evaluator::toString;
-
-%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
-  (float* data, int dim1, int dim2) 
-}
-
-%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { 
-  (float** view_data, int* dim1, int* dim2) 
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {
-  (float** view_m_data, int* dim1, int* dim2)  
-}
-
-%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (int** view_m_data, int* dim1)  
-}
-
-%apply (int* INPLACE_ARRAY1, int DIM1) { 
-  (int* data, int dim) 
-}
-
-%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (int** view_data, int* dim1)  
-}
-
-%apply (float* INPLACE_ARRAY1, int DIM1) {
-  (float* data, int dim)
-}
-
-%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (float** view_data, int* dim1)
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (float** view_m_data, int* dim1)
-}
-
-#endif
-// The below functions internally create object by "new", so it should use
-// use SWIG to handle gc. There are hints for SWIG to handle GC.
-%newobject Matrix::createZero;
-%newobject Matrix::createSparse;
-%newobject Matrix::createDense;
-%newobject Matrix::createDenseFromNumpy;
-%newobject Matrix::createCpuDenseFromNumpy;
-%newobject Matrix::createGpuDenseFromNumpy;
-%newobject Vector::createZero;
-%newobject Vector::create;
-%newobject Vector::createVectorFromNumpy;
-%newobject Vector::createCpuVectorFromNumpy;
-%newobject Vector::createGpuVectorFromNumpy;
-%newobject IVector::createZero;
-%newobject IVector::create;
-%newobject IVector::createVectorFromNumpy;
-%newobject IVector::createCpuVectorFromNumpy;
-%newobject IVector::createGpuVectorFromNumpy;
-%newobject Trainer::createByCommandLine;
-%newobject Trainer::getForwardOutput;
-%newobject Trainer::getLayerOutput;
-%newobject Arguments::getSlotValue;
-%newobject Arguments::getSlotIds;
-%newobject Arguments::getSlotIn;
-%newobject Arguments::getSlotSequenceStartPositions;
-%newobject Arguments::getSlotSequenceDim;
-%newobject Arguments::createArguments;
-%newobject GradientMachine::createByConfigProtoStr;
-%newobject GradientMachine::createByModelConfig;
-%newobject GradientMachine::asSequenceGenerator;
-%newobject GradientMachine::getParameter;
-%newobject GradientMachine::getLayerOutput;
-%newobject GradientMachine::makeEvaluator;
-%newobject TrainerConfig::createFromTrainerConfigFile;
-%newobject TrainerConfig::getModelConfig;
-%newobject TrainerConfig::getOptimizationConfig;
-%newobject Parameter::getBuf;
-%newobject Parameter::getConfig;
-%newobject ParameterOptimizer::create;
-%newobject ParameterOptimizer::needSpecialTraversal;
-%newobject ParameterUpdater::createLocalUpdater;
-%newobject ParameterUpdater::createRemoteUpdater;
-%newobject ParameterUpdater::createNewRemoteUpdater;
-
-%feature("director") UpdateCallback;
-%feature("autodoc", 1); // To generate method stub, for code hint in ide
-
-// Ignore many private class, and method cannot be handled by swig.
-%ignore MatrixPrivate;
-%ignore TrainerPrivate;
-%ignore IVector::operator[];
-%ignore ArgumentsPrivate;
-%ignore GradientMachinePrivate;
-%ignore TrainerConfigPrivate;
-%ignore ModelConfigPrivate;
-%ignore ParameterPrivate;
-%ignore SequenceGeneratorPrivate;
-%ignore VectorPrivate;
-%ignore ParameterConfigPrivate;
-%ignore OptimizationConfigPrivate;
-%ignore ParameterTraverseCallbackPrivate;
-%include "legacy/utils/GlobalConstants.h"
-%include "legacy/api/PaddleAPI.h"
diff --git a/paddle/legacy/api/PaddleAPI.h b/paddle/legacy/api/PaddleAPI.h
deleted file mode 100644
index 475984a3d..000000000
--- a/paddle/legacy/api/PaddleAPI.h
+++ /dev/null
@@ -1,1054 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-/// Import PaddlePaddle's enumeration into global namespace.
-using namespace paddle::enumeration_wrapper;  // NOLINT
-
-/**
- * @brief Initialize paddle.
- *
- * In python, this method should be invoked as
- * @code
- *  import sys
- *  import paddle
- *  paddle.initPaddle(sys.argv)
- *  or you can change arguments as any list of str.
- * @endcode
- */
-void initPaddle(int argc, char** argv);
-
-/// Return FLAGS_use_gpu
-bool isUsingGpu();
-
-/// Set the Flags_use_gpu to the given parameter
-void setUseGpu(bool useGpu);
-
-/// Return true if this py_paddle is compiled in GPU Version
-bool isGpuVersion();
-
-/// Return FLAGS_trainer_count
-int getTrainerCount();
-
-/// The Error of IO Operation. Such as file not found, etc.
-class IOError {};
-
-/// Out of range error
-class RangeError {};
-
-/// Not support Error, such as access GPU memory directly, etc.
-class UnsupportError : public std::runtime_error {
- public:
-  UnsupportError() : std::runtime_error(" ") {}
-  explicit UnsupportError(const std::string& message)
-      : std::runtime_error(message) {}
-};
-
-/// This type will map to python's list of float.
-struct FloatArray {
-  const float* buf;
-  const size_t length;
-  bool needFree;  // true if the buf is dynamic alloced.
-  FloatArray(const float* b, const size_t l);
-};
-
-/// This type will map to python's list of int
-struct IntArray {
-  const int* buf;
-  const size_t length;
-  bool needFree;
-  IntArray(const int* b, const size_t l, bool f = false);
-};
-
-/// This type will map to python's list of (int, float)
-struct IntWithFloatArray {
-  const float* valBuf;
-  const int* idxBuf;
-  const size_t length;
-  bool needFree;
-  IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false);
-};
-
-enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 };
-
-enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-/**
- * In Python, -1UL is hard to write. So define a const value used by python
- * side.
- */
-const size_t NO_SPARSE_ID = -1UL;
-
-struct MatrixPrivate;
-class Matrix {
-  Matrix();  // User Cannot Create Matrix.
-  DISABLE_COPY(Matrix);
-  static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
-
- public:
-  virtual ~Matrix();
-
-  /**
-   * Create A Matrix with height,width, which is filled by zero.
-   */
-  static Matrix* createZero(size_t height,
-                            size_t width,
-                            bool useGpu = isUsingGpu());
-
-  /**
-   * Create Sparse Matrix.
-   *
-   * After create sparse, sparseCopyFrom can be used to fill matrix.
-   *
-   * @param nnz  Number of non zero values.
-   *
-   * @note the default sparse type is SPARSE_CSR.
-   */
-  static Matrix* createSparse(size_t height,
-                              size_t width,
-                              size_t nnz,
-                              bool isNonVal = true,
-                              bool trans = false,
-                              bool useGpu = isUsingGpu());
-
-  /**
-   * Create Dense Matrix.
-   *
-   * @param data  list of float should be passed in python.
-   * @note        the value will be copy into a new matrix.
-   */
-  static Matrix* createDense(const std::vector<float>& data,
-                             size_t height,
-                             size_t width,
-                             bool useGpu = isUsingGpu());
-
-  static Matrix* createDenseFromNumpy(
-      float* data,
-      int dim1,
-      int dim2,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
-   *
-   *  @param data  a numpy matrix.
-   *  @param dim1  dimension of data.
-   *  @param dim2  dimension of data.
-   *  @param copy  true if copy into a new matrix, false will create
-   *               matrix inplace. copy = false should be used with extreme
-   *               care because Matrix will share the memory with the given
-   *               numpy array. If the numpy array object is no longer valid,
-   *               the memory space will not be usable.
-   */
-  static Matrix* createCpuDenseFromNumpy(float* data,
-                                         int dim1,
-                                         int dim2,
-                                         bool copy = true);
-
-  /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
-  static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
-
-  /**
-   * Cast to numpy matrix.
-   *
-   * @note    This method take no parameter in python.
-   * @note    This method in python will return a numpy matrix, not void.
-   * @note    Only CpuDenseMatrix is supported.
-   *
-   * Example:
-   * @code
-   * import paddle
-   * m = paddle.Matrix.createZero(10,2)
-   * numpy_mat = m.toNumpyMat()
-   * @endcode
-   */
-  void toNumpyMatInplace(float** view_data,
-                         int* dim1,
-                         int* dim2) throw(UnsupportError);
-
-  /// Copy To numpy mat.
-  void copyToNumpyMat(float** view_m_data,
-                      int* dim1,
-                      int* dim2) throw(UnsupportError);
-
-  /// Copy From Numpy Mat
-  void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError,
-                                                               RangeError);
-
-  /// return true if this matrix is sparse.
-  bool isSparse() const;
-
-  SparseValueType getSparseValueType() const throw(UnsupportError);
-
-  SparseFormatType getSparseFormat() const throw(UnsupportError);
-
-  IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError);
-
-  IntWithFloatArray getSparseRowColsVal(size_t i) const
-      throw(UnsupportError, RangeError);
-
-  size_t getHeight() const;
-
-  size_t getWidth() const;
-
-  float get(size_t x, size_t y) const throw(RangeError);
-
-  void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError);
-
-  /// return type is list of float
-  FloatArray getData() const;
-
-  /**
-   * Copy from rows, cols, values.
-   *
-   * if sparse_nonvalue, the values should be []
-   */
-  void sparseCopyFrom(const std::vector<int>& rows,
-                      const std::vector<int>& cols,
-                      const std::vector<float>& values =
-                          std::vector<float>()) throw(UnsupportError);
-
-  bool isGpu() const;
-
- private:
-  void* getSharedPtr() const;
-
-  MatrixPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class Arguments;
-};
-
-struct VectorPrivate;
-class Vector {
-  DISABLE_COPY(Vector);
-  Vector();
-  static Vector* createByPaddleVectorPtr(void* ptr);
-
-  void* getSharedPtr();
-
- public:
-  ~Vector();
-
-  /// Create Vector filled with zero.
-  static Vector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create Vector from list of float.
-   *
-   * It will create a new vector, and copy data into it.
-   */
-  static Vector* create(const std::vector<float>& data,
-                        bool useGpu = isUsingGpu());
-
-  static Vector* createVectorFromNumpy(
-      float* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-  /**
-   * Create Cpu Vector from numpy array, which dtype=float32
-   *
-   * If copy is false, it will create vector inplace.
-   */
-  static Vector* createCpuVectorFromNumpy(float* data,
-                                          int dim,
-                                          bool copy = true);
-
-  /// Create Gpu Vector from numpy array, which dtype=float32
-  static Vector* createGpuVectorFromNumpy(float* data, int dim);
-
-  /**
-   * copy from another vector
-   * throw(RangeError) if size of src vector is different from size of this
-   * vector
-   */
-  void copyFrom(Vector* src) throw(RangeError);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(float** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(float* data, int dim);
-
-  /// __getitem__ in python
-  float get(const size_t idx) const throw(RangeError, UnsupportError);
-
-  /// __setitem__ in python
-  void set(const size_t idx, float val) throw(RangeError, UnsupportError);
-
-  /// Return is GPU vector or not.
-  bool isGpu() const;
-
-  /// Return a list of float, the memory is alloced and copied.
-  FloatArray getData() const;
-
-  /// __len__ in python
-  size_t getSize() const;
-
- private:
-  VectorPrivate* m;
-
- private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct IVectorPrivate;
-class IVector {
-  IVector();
-  DISABLE_COPY(IVector);
-  static IVector* createByPaddleVectorPtr(void* ptr);
-
- public:
-  /// Create IVector filled with zero
-  static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create IVector from list of int.
-   * It will create a new vector, and copy data into it.
-   */
-  static IVector* create(const std::vector<int>& data,
-                         bool useGpu = isUsingGpu());
-
-  static IVector* createVectorFromNumpy(
-      int* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   * Create Cpu IVector from numpy array, which dtype=int32
-   *
-   * If copy is false, it will create vector inplace
-   */
-  static IVector* createCpuVectorFromNumpy(int* data,
-                                           int dim,
-                                           bool copy = true);
-  /**
-   * Create Gpu IVector from numpy array, which dtype=int32
-   */
-  static IVector* createGpuVectorFromNumpy(int* data, int dim);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(int** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(int* data, int dim);
-
-  virtual ~IVector();
-
-  /// Return a list of int, the memory is alloced and copied.
-  IntArray getData() const;
-
-  /// This method will map to python [] method.
-  int& operator[](const size_t idx) throw(RangeError, UnsupportError);
-
-  const int& operator[](const size_t idx) const
-      throw(RangeError, UnsupportError);
-
-  inline int get(const size_t idx) const throw(RangeError, UnsupportError) {
-    return (*this)[idx];
-  }
-
-  inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) {
-    (*this)[idx] = val;
-  }
-
-  /// Return true if it is gpu vector.
-  bool isGpu() const;
-
-  /// This method will map to python __len__();
-  size_t getSize() const;
-
- private:
-  void* getSharedPtr() const;
-
-  friend class Arguments;
-  IVectorPrivate* m;
-};
-
-struct ArgumentsPrivate;
-
-/// The Arguments is actual a std::vector<paddle::Argument> in paddle.
-class Arguments {
- private:
-  Arguments();  // Internal Create.
-  DISABLE_COPY(Arguments);
-
- public:
-  /**
-   * Create a arguments with size.
-   * Note that it can be zero.
-   */
-  static Arguments* createArguments(size_t slotNum);
-
-  void resize(size_t slotNum);
-
-  virtual ~Arguments();
-
-  /**
-   * Return the slot number that aguments contains.
-   *
-   * It is actually the vector's size
-   */
-  size_t getSlotNum() const;
-
-  /**
-   * The get functions of Arguments
-   *
-   * the param idx is the slot id
-   */
-  Matrix* getSlotValue(size_t idx) const throw(RangeError);
-  Matrix* getSlotGrad(size_t idx) const throw(RangeError);
-  IVector* getSlotIds(size_t idx) const throw(RangeError);
-  Matrix* getSlotIn(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceDim(size_t idx) const throw(RangeError);
-  // End Of get functions of Arguments
-
-  int64_t getBatchSize(size_t idx = 0) const throw(RangeError);
-
-  /**
-   * The set functions of Arguments.
-   *
-   * The param idx is the slot id.
-   * The other param is the input Matrix or vector.
-   */
-  void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
-  void setSlotSequenceStartPositions(size_t idx,
-                                     IVector* vec) throw(RangeError);
-  void setSlotSubSequenceStartPositions(size_t idx,
-                                        IVector* vec) throw(RangeError);
-  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError);
-
-  size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError);
-  size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError);
-
-  float sum() const;
-
- private:
-  static Arguments* createByPaddleArgumentVector(void* ptr);
-  static Arguments* createByPaddleArgument(const void* ptr);
-  void* getInternalArgumentsPtr() const;
-
- private:
-  ArgumentsPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class SequenceGenerator;
-};
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
-  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
-      paddle::GradientMachine::kSgdSparseCpuTraining,
-  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
-};
-
-struct ParameterConfigPrivate;
-class ParameterConfig {
-  DISABLE_COPY(ParameterConfig);
-  ParameterConfig();
-
-  /**
-   * Internal methods
-   */
-  static ParameterConfig* createParameterConfigFromParameterSharedPtr(
-      void* ptr);
-  static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
-  void* getRawPtr();
-
- public:
-  ~ParameterConfig();
-
-  /**
-   * return proto buf string.
-   */
-  std::string toProtoString() const;
-
- private:
-  ParameterConfigPrivate* m;
-
- private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct OptimizationConfigPrivate;
-class OptimizationConfig {
-  DISABLE_COPY(OptimizationConfig);
-  OptimizationConfig();
-
- public:
-  static OptimizationConfig* createFromProtoString(const std::string& str);
-  ~OptimizationConfig();
-
-  /**
-   * return protobuf string.
-   */
-  std::string toProtoString();
-
- private:
-  OptimizationConfigPrivate* m;
-
-  friend class TrainerConfig;
-  friend class ParameterOptimizer;
-  friend class ParameterUpdater;
-  friend class Trainer;
-};
-
-struct ParameterPrivate;
-class Parameter {
- private:
-  Parameter();
-  DISABLE_COPY(Parameter);
-
- public:
-  virtual ~Parameter();
-
-  /**
-   * get parameter name
-   */
-  std::string getName() const;
-
-  /**
-   * get buf in Parameter
-   */
-  Vector* getBuf(ParameterType type);
-
-  /**
-   * get id
-   */
-  size_t getID() const;
-
-  ParameterConfig* getConfig();
-  void setValueUpdated();
-
-  bool save(const std::string& filename) const;
-
-  bool load(const std::string& filename) const;
-
-  size_t getSize() const;
-
- private:
-  static Parameter* createFromRawPtr(void* ptr);
-  static Parameter* createFromSharedPtr(void* ptr);
-
- private:
-  ParameterPrivate* m;
-  friend class UpdateCallbackWrapper;
-  friend class GradientMachine;
-  friend class ParameterUpdater;
-};
-
-struct ModelConfigPrivate;
-/**
- * You can only get model config from TrainerConfig.
- *
- * It is used by GradientMachine.
- */
-class ModelConfig {
- private:
-  ModelConfig();
-  DISABLE_COPY(ModelConfig);
-
- public:
-  virtual ~ModelConfig();
-
- private:
-  ModelConfigPrivate* m;
-  friend class TrainerConfig;
-  friend struct TrainerConfigPrivate;
-  friend class GradientMachine;
-};
-
-struct TrainerConfigPrivate;
-/**
- * To get TrainerConfig from file.
- *
- * It is used by GradientMachine.
- */
-class TrainerConfig {
- private:
-  TrainerConfig();
-  DISABLE_COPY(TrainerConfig);
-
- public:
-  virtual ~TrainerConfig();
-
-  static TrainerConfig* createFromTrainerConfigFile(
-      const std::string& configPath);
-  static TrainerConfig* createFromProtoString(const std::string& str);
-
-  ModelConfig* getModelConfig() const;
-
-  OptimizationConfig* getOptimizationConfig() const;
-
- private:
-  TrainerConfigPrivate* m;
-  friend class Trainer;
-};
-
-/**
- * The callback in backword.
- *
- * You can inherit this class in python.
- *
- * @code
- * class UpdateCallbackInPython(paddle.UpdateCallback):
- *   def __init__(self):
- *     paddle.UpdateCallback.__init__(self)
- *
- *   def apply(self, param):
- *     assert isinstance(param, paddle.Parameter)
- * @endcode
- */
-class UpdateCallback {
- public:
-  virtual ~UpdateCallback();
-  virtual void apply(Parameter* p);
-};
-
-struct ParameterTraverseCallbackPrivate;
-class ParameterTraverseCallback {
-  DISABLE_COPY(ParameterTraverseCallback);
-  ParameterTraverseCallback();
-
- public:
-  ~ParameterTraverseCallback();
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& config,
-             size_t sparseId);
-
- private:
-  ParameterTraverseCallbackPrivate* m;
-  friend class ParameterOptimizer;
-};
-
-/**
- * The ParameterOptimizer Wrapper Class.
- *
- * Basically same as common/ParameterOptimizer.h
- */
-struct ParameterOptimizerPrivate;
-class ParameterOptimizer {
-  DISABLE_COPY(ParameterOptimizer);
-  ParameterOptimizer();
-
- public:
-  static ParameterOptimizer* create(OptimizationConfig* config);
-
-  ~ParameterOptimizer();
-
-  void init(size_t numRows, const ParameterConfig* config);
-
-  void startPass();
-
-  void finishPass();
-
-  void startBatch(size_t numSamplesProcessed);
-
-  void finishBatch();
-
-  void update(const std::vector<Vector*>& vecs,
-              const ParameterConfig& conf,
-              size_t sparseId = NO_SPARSE_ID);
-
-  std::vector<int> getParameterTypes() const;
-
-  ParameterTraverseCallback* needSpecialTraversal(
-      const ParameterConfig& config) const;
-
- private:
-  ParameterOptimizerPrivate* m;
-};
-
-class SequenceGenerator;
-class Evaluator;
-struct GradientMachinePrivate;
-class GradientMachine {
- private:
-  GradientMachine();
-  DISABLE_COPY(GradientMachine);
-
- public:
-  virtual ~GradientMachine();
-
-  /**
-   * Create By ProtoStr.
-   *
-   * The ProtoStr can be generate by python's protobuf code.
-   */
-  static GradientMachine* createByConfigProtoStr(
-      const std::string& protoStr,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * Create by ModelConfig object.
-   *
-   * To get ModelConfig, you can get TrainerConfig from config file, then get
-   * model config by TrainerConfig
-   */
-  static GradientMachine* createByModelConfig(
-      ModelConfig* conf,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * @brief finish
-   */
-  void finish();
-
-  void start();
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  void prefetch(const Arguments& inArgs);
-
-  /**
-   * Do some thing when train pass ended.
-   */
-  void onPassEnd();
-
-  /**
-   * The forward stage of GradientMachine.
-   *
-   * @note  the outArgs could be zero length arguemnts.
-   * @note  THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL.
-   */
-  void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType);
-
-  /**
-   * The backward stage of GradientMachine.
-   *
-   * @note  Currently the ParameterUpdater is not wrapped in SWIG, so backward
-   * cannot actually train a network. But you can write a update callback to
-   * change the parameter or implement a ParameterUpdater in python side.
-   */
-  void backward(const UpdateCallback& callback = UpdateCallback());
-
-  /**
-   * Combine forward/backward
-   */
-  void forwardBackward(const Arguments& inArgs,
-                       Arguments* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback = UpdateCallback());
-
-  void loadParameters(const std::string& path);
-
-  size_t getParameterSize() const;
-  Parameter* getParameter(size_t i) throw(RangeError);
-
-  size_t getNonStaticParameterSize() const;
-  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
-
-  void randParameters();
-
-  Arguments* getLayerOutput(const std::string& layerName) const
-      throw(UnsupportError);
-
-  /**
-   * Create a sequence generator.
-   *
-   * @note  It just like a paddle_gen_sequence.
-   */
-  SequenceGenerator* asSequenceGenerator(
-      const std::vector<std::string>& dict = std::vector<std::string>(),
-      size_t begin_id = 0UL,
-      size_t end_id = 0UL,
-      size_t max_length = 100UL,
-      size_t beam_size = -1UL);
-
-  Evaluator* makeEvaluator();
-
-  void eval(Evaluator* evaluator);
-
- private:
-  GradientMachinePrivate* m;
-
-  static GradientMachine* createFromPaddleModelPtr(
-      const void* confPtr,
-      GradientMatchineCreateMode mode,
-      const std::vector<int>& types);
-
-  // Not to use c++ 11 init-list, so we use static var as function default arg.
-  static std::vector<int> defaultParamTypes;
-  friend class Trainer;
-  friend class ParameterUpdater;
-};
-
-struct ParameterUpdaterPrivate;
-class ParameterUpdater {
- private:
-  ParameterUpdater();
-
- public:
-  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
-  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount,
-                                               bool useSparseUpdater);
-  static ParameterUpdater* createNewRemoteUpdater(
-      OptimizationConfig* config,
-      const std::string pserverSpec,
-      const bool useEtcd) throw(UnsupportError);
-  ~ParameterUpdater();
-
-  /**
-   * @brief initialize Parameter Updater by GradientMachine.
-   * @param gm
-   */
-  void init(const GradientMachine& gm);
-
-  /**
-   * @brief begin of a training/testing of one pass.
-   */
-  void startPass();
-
-  /**
-   * @brief end of a traning/testing of one pass.
-   */
-  void finishPass();
-
-  /**
-   * @brief begin of a training/testing of one batch.
-   * @param data batch's size
-   * @return PassType, mostly will be training.
-   */
-  PassType startBatch(size_t batchSize);
-
-  /**
-   * @brief end of a traning/testing of one batch
-   * @param cost current batch cost.
-   */
-  void finishBatch(float cost);
-
-  /**
-   * @brief update a parameter (by local optimizer or by cluster pserver)
-   * @param param
-   */
-  void update(Parameter* param);
-
-  /**
-   * @breif only get required sparse rows by default.
-   * @param fullSize: get full matrix parameter if *fullSize* set
-   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
-   */
-  void getParametersRemote(bool fullSize = false, bool apply = false);
-
-  /**
-   * @brief restore the average parameter.
-   * @note It is only used in AverageOptimizer. Restore will get the current
-   * PARAMETER_VALUE back.
-   */
-  void restore();
-
-  /**
-   * @brief apply. Store the average parameter.
-   * @note It is only used in AverageOptimizer. Apply will store the current
-   * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save
-   * it to PARAMETER_VALUE.
-   */
-  void apply();
-
-  /**
-   * @brief catchUpWith The Regularization will be delayed in many situations(
-   * pserver, local sparse). Catch Up means catch the regularization up, apply
-   * regularization to all params.
-   */
-  void catchUpWith();
-
- private:
-  ParameterUpdaterPrivate* m;
-};
-
-struct EvaluatorPrivate;
-class Evaluator {
- private:
-  Evaluator();
-  DISABLE_COPY(Evaluator);
-
- public:
-  ~Evaluator();
-
-  /**
-   * @brief begin an evaluate stage.
-   */
-  void start();
-
-  /**
-   * @brief end an evaluate stage.
-   */
-  void finish();
-
-  /**
-   * @brief toString will get a evaluate result.
-   *
-   * __repr__ method in python
-   */
-  std::string toString();
-
-  std::vector<std::string> getNames() const;
-
-  double getValue(const std::string name) const;
-
- private:
-  EvaluatorPrivate* m;
-
-  friend class GradientMachine;
-};
-
-struct TrainerPrivate;
-class Trainer {
- private:
-  TrainerPrivate* m;
-  Trainer();
-  Trainer(TrainerConfig* optConfig, GradientMachine* gm);
-  DISABLE_COPY(Trainer);
-
- public:
-  virtual ~Trainer();
-
-  /// Create A Trainer By TrainerConfig. using paddle command line.
-  static Trainer* createByCommandLine() throw(IOError);
-
-  static Trainer* create(TrainerConfig* optConfig,
-                         GradientMachine* gm) throw(IOError);
-
-  /// Start training
-  void startTrain();
-
-  /// Finish training
-  void finishTrain();
-
-  /// Start a pass.
-  void startTrainPass();
-
-  /// Finish a pass
-  void finishTrainPass();
-
-  /**
-   * Train one batch,
-   *
-   * @return true if all batch finished.
-   */
-  bool trainOneBatch(size_t batchSize);
-
-  void trainOneDataBatch(size_t batchSize, const Arguments& args);
-
-  void startTestPeriod();
-  void testOneDataBatch(size_t batchSize, const Arguments& args);
-  void finishTestPeriod();
-
-  void forwardOneBatch(size_t batchSize);
-
-  Arguments* getForwardOutput();
-
-  Arguments* getLayerOutput(const std::string& layerName) const;
-};
-
-/// the N-Best results generated from one input sequence.
-class ISequenceResults {
- public:
-  virtual ~ISequenceResults();
-
-  /// Number of result.
-  virtual size_t getSize() const = 0;
-
-  /**
-   * Get sentence from dictionary.
-   *
-   * @param id  the index of result.
-   * @param split  if true, the return sentence will be splited with ' ' by
-   *               each word. Default is false.
-   */
-  virtual std::string getSentence(size_t id, bool split = false) const
-      throw(RangeError) = 0;
-  virtual std::vector<int> getSequence(size_t id) const throw(RangeError) = 0;
-  virtual float getScore(size_t id) const throw(RangeError) = 0;
-};
-
-struct SequenceGeneratorPrivate;
-class SequenceGenerator {
-  DISABLE_COPY(SequenceGenerator);
-  SequenceGenerator();
-
- public:
-  virtual ~SequenceGenerator();
-
-  /**
-   * Generate Sequence by input.
-   *
-   * @note  The inArgs is just one sequence of data.
-   * @note  The return will get a N-best generate result by inArgs.
-   *        Sort by score.
-   */
-  ISequenceResults* generateSequence(const Arguments& inArgs) const;
-
-  void setDict(const std::vector<std::string>& dict);
-  void setBos(size_t bos);
-  void setEos(size_t eos);
-  void setMaxLength(size_t maxlength);
-  void setBeamSize(size_t beamSize);
-
- private:
-  static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
-  friend class GradientMachine;
-
- private:
-  SequenceGeneratorPrivate* m;
-};
diff --git a/paddle/legacy/api/PaddleAPIPrivate.h b/paddle/legacy/api/PaddleAPIPrivate.h
deleted file mode 100644
index 3ee192c31..000000000
--- a/paddle/legacy/api/PaddleAPIPrivate.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <memory>
-#include "PaddleAPI.h"
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-#include "paddle/legacy/trainer/TrainerConfigHelper.h"
-
-struct GradientMachinePrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-};
-
-struct OptimizationConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
-  paddle::OptimizationConfig config;
-
-  const paddle::OptimizationConfig& getConfig() {
-    if (trainer_config != nullptr) {
-      return trainer_config->getOptConfig();
-    } else {
-      return config;
-    }
-  }
-};
-
-struct TrainerConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-  TrainerConfigPrivate() {}
-};
-
-struct ModelConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-};
-
-struct ArgumentsPrivate {
-  std::vector<paddle::Argument> outputs;
-
-  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
-    if (idx < outputs.size()) {
-      return outputs[idx];
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
-  template <typename T>
-  std::shared_ptr<T>& cast(void* rawPtr) const {
-    return *(std::shared_ptr<T>*)(rawPtr);
-  }
-};
-
-struct ParameterUpdaterPrivate {
-  std::unique_ptr<paddle::ParameterUpdater> updater;
-};
-
-struct ParameterPrivate {
-  std::shared_ptr<paddle::Parameter> sharedPtr;
-  paddle::Parameter* rawPtr;  // rawPtr only used in ParameterUpdater,
-                              // in other situation sharedPtr should
-                              // contains value.
-
-  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
-
-  paddle::Parameter* getPtr() {
-    if (sharedPtr) {
-      return sharedPtr.get();
-    } else {
-      return rawPtr;
-    }
-  }
-};
-
-struct EvaluatorPrivate {
-  paddle::Evaluator* rawPtr;
-
-  EvaluatorPrivate() : rawPtr(nullptr) {}
-  ~EvaluatorPrivate() { delete rawPtr; }
-};
diff --git a/paddle/legacy/api/Parameter.cpp b/paddle/legacy/api/Parameter.cpp
deleted file mode 100644
index f05740eb7..000000000
--- a/paddle/legacy/api/Parameter.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/parameter/Parameter.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-Parameter::Parameter() : m(new ParameterPrivate()) {}
-
-Parameter::~Parameter() { delete m; }
-
-Parameter* Parameter::createFromRawPtr(void* ptr) {
-  auto p = new Parameter();
-  p->m->rawPtr = *static_cast<paddle::Parameter**>(ptr);
-  return p;
-}
-
-Parameter* Parameter::createFromSharedPtr(void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p == nullptr) {
-    return nullptr;
-  } else {
-    auto retParam = new Parameter();
-    retParam->m->sharedPtr = p;
-    return retParam;
-  }
-}
-
-std::string Parameter::getName() const { return m->getPtr()->getName(); }
-
-Vector* Parameter::getBuf(ParameterType type) {
-  auto buf = m->getPtr()->getBuf(type);
-  return Vector::createByPaddleVectorPtr(&buf);
-}
-
-ParameterConfig* Parameter::getConfig() {
-  if (m->sharedPtr) {
-    return ParameterConfig::createParameterConfigFromParameterSharedPtr(
-        &m->sharedPtr);
-  } else {
-    return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr);
-  }
-}
-
-size_t Parameter::getID() const { return m->getPtr()->getID(); }
-
-void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
-
-bool Parameter::save(const std::string& filename) const {
-  return m->getPtr()->save(filename);
-}
-
-bool Parameter::load(const std::string& filename) const {
-  return m->getPtr()->load(filename);
-}
-
-size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/legacy/api/ParameterOptimizer.cpp b/paddle/legacy/api/ParameterOptimizer.cpp
deleted file mode 100644
index 477d9dae4..000000000
--- a/paddle/legacy/api/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include <algorithm>
-#include "Internal.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-struct ParameterOptimizerPrivate {
-  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
-};
-
-struct ParameterTraverseCallbackPrivate {
-  paddle::ParameterOptimizer::TraverseCallback callback;
-
-  ParameterTraverseCallbackPrivate() {}
-
-  ParameterTraverseCallbackPrivate(
-      const paddle::ParameterOptimizer::TraverseCallback& callback)
-      : callback(callback) {}
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& conf,
-             size_t sparseId) {
-    std::vector<paddle::VectorPtr> real_vecs;
-    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-      if (v) {
-        return *(paddle::VectorPtr*)(v->getSharedPtr());
-      } else {
-        return paddle::VectorPtr();
-      }
-    });
-
-    paddle::ParameterConfig& real_conf =
-        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
-                                        .getRawPtr());
-    callback(real_vecs.data(), real_conf, sparseId);
-  }
-};
-
-ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
-
-ParameterOptimizer::~ParameterOptimizer() { delete m; }
-
-ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
-  CHECK(config != nullptr);
-  auto retOptimizer = new ParameterOptimizer();
-  retOptimizer->m->optimizer.reset(
-      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
-  return retOptimizer;
-}
-
-void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
-  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
-                                               ->getRawPtr());
-  m->optimizer->init(numRows, &conf);
-}
-
-void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
-
-void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
-
-void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
-  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
-  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
-  m->optimizer->startBatch((int64_t)numSamplesProcessed);
-}
-
-void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
-
-void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
-                                const ParameterConfig& conf,
-                                size_t sparseId) {
-  ParameterTraverseCallbackPrivate invoker(
-      [&](const paddle::VectorPtr _vecs[],
-          const paddle::ParameterConfig& config,
-          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
-  invoker.apply(vecs, conf, sparseId);
-}
-
-std::vector<int> ParameterOptimizer::getParameterTypes() const {
-  std::vector<int> returnValue;
-  staticCastVector(&returnValue, m->optimizer->getParameterTypes());
-  return returnValue;
-}
-
-ParameterTraverseCallback::ParameterTraverseCallback()
-    : m(new ParameterTraverseCallbackPrivate()) {}
-
-ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
-
-void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
-                                      const ParameterConfig& conf,
-                                      size_t sparseId) {
-  m->apply(vecs, conf, sparseId);
-}
-
-ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  auto& param_config =
-      *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
-           .getRawPtr();
-  auto callback = m->optimizer->needSpecialTraversal(param_config);
-  if (callback) {
-    auto retCallback = new ParameterTraverseCallback();
-    retCallback->m->callback = callback;
-    return retCallback;
-  } else {
-    return nullptr;
-  }
-}
diff --git a/paddle/legacy/api/ParameterUpdater.cpp b/paddle/legacy/api/ParameterUpdater.cpp
deleted file mode 100644
index 44af3f463..000000000
--- a/paddle/legacy/api/ParameterUpdater.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "PaddleAPIPrivate.h"
-#ifndef PADDLE_WITHOUT_GOLANG
-#include "paddle/legacy/trainer/NewRemoteParameterUpdater.h"
-#endif
-#include "paddle/legacy/trainer/RemoteParameterUpdater.h"
-#include "paddle/legacy/trainer/ThreadParameterUpdater.h"
-
-ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
-
-ParameterUpdater *ParameterUpdater::createLocalUpdater(
-    OptimizationConfig *config) {
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(
-      new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return updater;
-}
-
-ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
-    OptimizationConfig *config,
-    const std::string pserverSpec,
-    const bool useEtcd) throw(UnsupportError) {
-#ifndef PADDLE_WITHOUT_GOLANG
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec, useEtcd));
-  return updater;
-#else
-  throw UnsupportError("not compiled with WITH_GOLANG");
-#endif
-}
-
-ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
-  auto updater = new ParameterUpdater();
-  auto remoteUpdater = new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr);
-  if (useSparseUpdater) {
-    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
-    auto sparseRemoteUpdater =
-        new paddle::SparseRemoteParameterUpdaterComposite(
-            config->m->getConfig(),
-            passCount,
-            false,
-            std::move(remoteUpdaterPtr));
-    updater->m->updater.reset(sparseRemoteUpdater);
-  } else {
-    updater->m->updater.reset(remoteUpdater);
-  }
-  return updater;
-}
-
-ParameterUpdater::~ParameterUpdater() { delete m; }
-
-void ParameterUpdater::init(const GradientMachine &gm) {
-  m->updater->init(gm.m->machine->getNonStaticParameters());
-}
-
-void ParameterUpdater::startPass() { m->updater->startPass(); }
-
-void ParameterUpdater::finishPass() { m->updater->finishPass(); }
-
-PassType ParameterUpdater::startBatch(size_t batchSize) {
-  return m->updater->startBatch((int64_t)batchSize);
-}
-
-void ParameterUpdater::finishBatch(float cost) {
-  m->updater->finishBatch(cost);
-}
-
-void ParameterUpdater::update(Parameter *param) {
-  auto paddleParam = param->m->getPtr();
-  m->updater->update(paddleParam);
-}
-
-void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
-  m->updater->getParametersRemote(fullSize, apply);
-}
-
-void ParameterUpdater::restore() { m->updater->restore(); }
-
-void ParameterUpdater::apply() { m->updater->apply(); }
-
-void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/legacy/api/SequenceGenerator.cpp b/paddle/legacy/api/SequenceGenerator.cpp
deleted file mode 100644
index 2a73228f6..000000000
--- a/paddle/legacy/api/SequenceGenerator.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <iterator>
-#include <sstream>
-#include <vector>
-#include "PaddleAPI.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/Flags.h"
-
-// used to represent partial sequence
-struct Path {
-  std::vector<int> ids;
-  float logProb;
-  paddle::MachineState machineState;
-
-  Path() { logProb = 0; }
-
-  Path(std::vector<int>& ids, float logProb, paddle::MachineState& machineState)
-      : ids(ids), logProb(logProb), machineState(machineState) {}
-
-  bool operator<(const Path& other) const { return (logProb > other.logProb); }
-};
-
-// Return top k (k == beam_size) optimal paths using beam search. The last
-// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer
-// as output and outArgs thus stores top k labels and their probabilities per
-// position
-static void findNBest(paddle::GradientMachine* gradMachine,
-                      std::vector<paddle::Argument>& inArgs,
-                      std::vector<Path>& finalPaths,
-                      size_t bos_id,
-                      size_t eos_id,
-                      size_t max_length) {
-  std::vector<Path> paths;
-  Path emptyPath;
-  paths.push_back(emptyPath);
-  finalPaths.clear();
-  gradMachine->resetState();
-  paddle::Argument feedback = inArgs.back();
-  feedback.ids->setElement(0, (int)(bos_id));
-  float minFinalPathLogProb = 0;
-  size_t beam = 0;
-  int id;
-  std::vector<paddle::Argument> outArgs;
-  while (true) {  // iterate over each generated word
-    std::vector<Path> newPaths;
-    paddle::MachineState machineState;
-    for (size_t j = 0; j < paths.size(); j++) {
-      Path& path = paths[j];
-      if (path.machineState.size() > 0) {
-        gradMachine->setState(path.machineState);
-        feedback.ids->setElement(0, path.ids.back());
-      }
-      gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST);
-      gradMachine->getState(machineState);
-      beam = outArgs[0].ids->getSize();
-      for (size_t k = 0; k < beam; k++) {
-        id = outArgs[0].ids->getElement(k);
-        float prob = outArgs[0].in->getElement(0, k);
-        std::vector<int> nids(path.ids);
-        nids.push_back(id);
-        float newLogProb = path.logProb + log(prob);
-        Path newPath(nids, newLogProb, machineState);
-        if (id == (int)eos_id || nids.size() >= max_length) {
-          finalPaths.push_back(newPath);
-          if (minFinalPathLogProb > newPath.logProb) {
-            minFinalPathLogProb = newPath.logProb;
-          }
-        } else {
-          newPaths.push_back(newPath);
-        }
-      }
-    }
-
-    if (newPaths.size() == 0) {
-      break;
-    }
-    std::nth_element(newPaths.begin(),
-                     newPaths.begin() + std::min(beam, newPaths.size()),
-                     newPaths.end());
-    if (newPaths.size() > beam) {
-      newPaths.resize(beam);
-    }
-    // pathA < pathB means pathA.logProb > pathB.logProb
-    float maxPathLogProb =
-        std::min_element(newPaths.begin(), newPaths.end())->logProb;
-    if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) {
-      break;
-    }
-    paths = newPaths;
-  }  // end while
-
-  std::partial_sort(finalPaths.begin(),
-                    finalPaths.begin() + std::min(beam, finalPaths.size()),
-                    finalPaths.end());
-  if (finalPaths.size() > beam) {
-    finalPaths.resize(beam);
-  }
-}
-
-struct SequenceGeneratorPrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-  std::shared_ptr<std::vector<std::string>> dict;
-  size_t beginPos;
-  size_t endPos;
-  size_t maxLength;
-
-  paddle::Argument feedback;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-
-  inline void findNBest(std::vector<paddle::Argument>& inArgs,
-                        std::vector<Path>& path) {
-    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
-  }
-
-  SequenceGeneratorPrivate()
-      : dict(std::make_shared<std::vector<std::string>>()),
-        beginPos(0UL),
-        endPos(0UL),
-        maxLength(0UL),
-        feedback(__create_feedback__()) {}
-
- private:
-  static paddle::Argument __create_feedback__() {
-    paddle::Argument feedback;
-    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
-
-    feedback.sequenceStartPositions =
-        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
-    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
-    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
-    return feedback;
-  }
-};
-
-SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
-
-SequenceGenerator::~SequenceGenerator() { delete m; }
-
-class PathSequenceResults : public ISequenceResults {
-  // ISequenceResults interface
- public:
-  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
-                      const std::shared_ptr<std::vector<std::string>>& dict)
-      : path_(path), dict_(dict) {}
-
-  size_t getSize() const { return path_->size(); }
-  std::string getSentence(size_t id, bool split) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      std::ostringstream sout;
-      std::transform(p.ids.begin(),
-                     p.ids.end(),
-                     std::ostream_iterator<std::string>(sout, split ? " " : ""),
-                     [&](int id) { return (*dict_)[id]; });
-      return sout.str();
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  std::vector<int> getSequence(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.ids;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  float getScore(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.logProb;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
- private:
-  std::shared_ptr<std::vector<Path>> path_;
-  std::shared_ptr<std::vector<std::string>> dict_;
-};
-
-ISequenceResults* SequenceGenerator::generateSequence(
-    const Arguments& inArgs) const {
-  auto& in_args =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  for (auto& arg : in_args) {
-    arg.sequenceStartPositions = m->feedback.sequenceStartPositions;
-  }
-  in_args.push_back(m->feedback);
-  auto path = std::make_shared<std::vector<Path>>();
-  m->findNBest(in_args, *path);
-  return new PathSequenceResults(path, m->dict);
-}
-
-SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr(
-    void* ptr) {
-  SequenceGenerator* r = new SequenceGenerator();
-  r->m->machine = r->m->cast<std::shared_ptr<paddle::GradientMachine>>(ptr);
-  return r;
-}
-
-void SequenceGenerator::setDict(const std::vector<std::string>& dict) {
-  *m->dict = dict;
-}
-
-void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; }
-
-void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; }
-
-void SequenceGenerator::setMaxLength(size_t maxLength) {
-  m->maxLength = maxLength;
-}
-
-void SequenceGenerator::setBeamSize(size_t beamSize) {
-  if (beamSize != -1UL) {
-    FLAGS_beam_size = beamSize;
-  }
-}
-
-ISequenceResults::~ISequenceResults() {}
diff --git a/paddle/legacy/api/Trainer.cpp b/paddle/legacy/api/Trainer.cpp
deleted file mode 100644
index e7c607201..000000000
--- a/paddle/legacy/api/Trainer.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include <stdlib.h>
-#include <atomic>
-#include <memory>
-
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/trainer/ParamUtil.h"
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/trainer/TrainerInternal.h"
-#include "paddle/legacy/utils/Flags.h"
-
-using paddle::real;
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-
-struct TrainerPrivate : public paddle::Trainer {
-  bool _trainOneBatch(size_t batchSize);
-  bool forwardOneBatch(size_t batchSize);
-  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
-  void setBatchSize(size_t batchSize);
-  std::vector<paddle::Argument>& getForwardOutput();
-
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const paddle::DataBatch& dataBatch);
-  TrainerPrivate() : paddle::Trainer() {}
-};
-
-Trainer::Trainer() : m(new TrainerPrivate()) {
-  auto conf = paddle::TrainerConfigHelper::createFromFlags();
-  if (conf != nullptr) {
-    m->init(conf);
-  }
-}
-
-Trainer::~Trainer() { delete m; }
-
-Trainer* Trainer::createByCommandLine() throw(IOError) {
-  auto retv = new Trainer();
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    throw IOError();
-  }
-}
-
-Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
-    : m(new TrainerPrivate()) {
-  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
-}
-
-Trainer* Trainer::create(TrainerConfig* config,
-                         GradientMachine* gm) throw(IOError) {
-  auto retv = new Trainer(config, gm);
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    retv->m->getConfig().CheckInitialized();
-    throw IOError();
-  }
-}
-
-void Trainer::startTrain() { m->startTrain(); }
-
-void Trainer::finishTrain() { m->finishTrain(); }
-
-void Trainer::startTrainPass() { m->startTrainPass(); }
-
-void Trainer::finishTrainPass() { m->finishTrainPass(); }
-
-void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = inArgs.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->trainOneDataBatch(dataBatch);
-}
-
-bool Trainer::trainOneBatch(size_t batchSize) {
-  return m->_trainOneBatch(batchSize);
-}
-
-bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
-  paddle::DataBatch dataBatch;
-  CHECK(dataProvider_) << "data_provider is not specified";
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-  trainOneDataBatch(dataBatch);
-  return false;
-}
-
-void TrainerPrivate::startTestPeriod() {
-  if (!tester_) {
-    createTester();
-  }
-  tester_->startTestPeriod();
-}
-
-void Trainer::startTestPeriod() { m->startTestPeriod(); }
-
-void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
-  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
-}
-
-void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = args.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->testOneDataBatch(dataBatch);
-}
-
-void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
-void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
-
-Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
-  auto nn = this->m->getGradientMachine();
-  CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto arg = nn->getLayerOutput(layerName);
-  return Arguments::createByPaddleArgument(&arg);
-}
-
-void Trainer::forwardOneBatch(size_t batchSize) {
-  m->forwardOneBatch(batchSize);
-}
-
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
-  CHECK(dataProvider_) << "data_provider is not specified";
-  paddle::DataBatch dataBatch;
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-
-  forwardOneDataBatch(dataBatch.getStreams());
-  return true;
-}
-
-void TrainerPrivate::forwardOneDataBatch(
-    const std::vector<paddle::Argument>& inArgs) {
-  std::vector<paddle::Argument>& outArgs = forwardOutput_;
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    trainerInternal_.getGradientMachine()->prefetch(inArgs);
-    trainerInternal_.getParameterUpdater()->getParametersRemote();
-  }
-  trainerInternal_.getGradientMachine()->forward(
-      inArgs, &outArgs, paddle::PASS_TEST);
-}
-
-Arguments* Trainer::getForwardOutput() {
-  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
-}
-
-std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
-  return forwardOutput_;
-}
diff --git a/paddle/legacy/api/Util.cpp b/paddle/legacy/api/Util.cpp
deleted file mode 100644
index b458c4d90..000000000
--- a/paddle/legacy/api/Util.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-
-void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-FloatArray::FloatArray(const float* b, const size_t l)
-    : buf(b), length(l), needFree(false) {}
-
-IntArray::IntArray(const int* b, const size_t l, bool f)
-    : buf(b), length(l), needFree(f) {}
-
-IntWithFloatArray::IntWithFloatArray(const float* v,
-                                     const int* i,
-                                     size_t l,
-                                     bool f)
-    : valBuf(v), idxBuf(i), length(l), needFree(f) {}
-
-bool isUsingGpu() { return FLAGS_use_gpu; }
-
-void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
-
-bool isGpuVersion() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-int getTrainerCount() { return FLAGS_trainer_count; }
-
-static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
-              "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/legacy/api/Vector.cpp b/paddle/legacy/api/Vector.cpp
deleted file mode 100644
index 73b6d3a15..000000000
--- a/paddle/legacy/api/Vector.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/legacy/math/Vector.h"
-
-#include <cstring>
-
-struct IVectorPrivate {
-  paddle::IVectorPtr vec;
-};
-
-IVector::IVector() : m(new IVectorPrivate()) {}
-
-IVector* IVector::createZero(size_t sz, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(sz, useGpu);
-  v->m->vec->zeroMem();
-  return v;
-}
-
-IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(data.size(), useGpu);
-  v->m->vec->copyFrom(data.data(), data.size());
-  return v;
-}
-
-IVector* IVector::createVectorFromNumpy(int* data,
-                                        int dim,
-                                        bool copy,
-                                        bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=true is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return IVector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return IVector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
-  auto v = new IVector();
-  if (copy) {
-    v->m->vec = paddle::IVector::create(dim, false);
-    v->m->vec->copyFrom(data, dim);
-  } else {
-    v->m->vec = paddle::IVector::create(data, dim, false);
-  }
-  return v;
-}
-
-IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(dim, true);
-  v->m->vec->copyFrom(data, dim);
-  return v;
-}
-
-bool IVector::isGpu() const {
-  return dynamic_cast<paddle::GpuIVector*>(m->vec.get()) != nullptr;
-}
-
-IntArray IVector::getData() const {
-  if (this->isGpu()) {
-    int* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    int* dest = new int[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(int));
-    return IntArray(dest, len, true);
-  } else {
-    return IntArray(m->vec->getData(), m->vec->getSize());
-  }
-}
-
-int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) {
-  if (this->isGpu()) {
-    UnsupportError e;
-    throw e;
-  } else {
-    if (idx >= m->vec->getSize()) {
-      RangeError e;
-      throw e;
-    }
-  }
-  return m->vec->getData()[idx];
-}
-
-const int& IVector::operator[](const size_t idx) const
-    throw(RangeError, UnsupportError) {
-  return (*const_cast<IVector*>(this))[idx];
-}
-
-IVector* IVector::createByPaddleVectorPtr(void* ptr) {
-  auto* p = (paddle::IVectorPtr*)ptr;
-  if ((*p) != nullptr) {
-    IVector* vec = new IVector();
-    vec->m->vec = *p;
-    return vec;
-  } else {
-    return nullptr;
-  }
-}
-
-IVector::~IVector() { delete m; }
-
-void* IVector::getSharedPtr() const { return &m->vec; }
-
-size_t IVector::getSize() const { return m->vec->getSize(); }
-
-void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuIVector>(m->vec);
-  if (v) {
-    *data = v->getData();
-    *dim1 = v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new int[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void IVector::copyFromNumpyArray(int* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-struct VectorPrivate {
-  paddle::VectorPtr vec;
-
-  void safeAccessData(const size_t idx,
-                      const std::function<void(float&)>& func) const
-      throw(RangeError, UnsupportError) {
-    auto cpuVec = std::dynamic_pointer_cast<const paddle::CpuVector>(vec);
-    if (cpuVec != nullptr) {
-      if (idx < vec->getSize()) {
-        func(vec->getData()[idx]);
-      } else {
-        throw RangeError();
-      }
-    } else {
-      throw UnsupportError();
-    }
-  }
-};
-
-Vector::Vector() : m(new VectorPrivate()) {}
-
-Vector::~Vector() { delete m; }
-
-Vector* Vector::createZero(size_t sz, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(sz, useGpu);
-  retVec->m->vec->zero();
-  return retVec;
-}
-
-Vector* Vector::create(const std::vector<float>& data, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(data.size(), useGpu);
-  retVec->m->vec->copyFrom(data.data(), data.size());
-  return retVec;
-}
-
-Vector* Vector::createByPaddleVectorPtr(void* ptr) {
-  auto& v = *(paddle::VectorPtr*)(ptr);
-  if (v == nullptr) {
-    return nullptr;
-  } else {
-    auto retVec = new Vector();
-    retVec->m->vec = v;
-    return retVec;
-  }
-}
-
-Vector* Vector::createVectorFromNumpy(float* data,
-                                      int dim,
-                                      bool copy,
-                                      bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=True is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Vector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return Vector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  if (copy) {
-    retVec->m->vec = paddle::Vector::create((size_t)dim, false);
-    retVec->m->vec->copyFrom(data, dim);
-  } else {
-    retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
-  }
-  return retVec;
-}
-
-Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create((size_t)dim, true);
-  retVec->m->vec->copyFrom(data, (size_t)dim);
-  return retVec;
-}
-
-void Vector::toNumpyArrayInplace(float** view_data,
-                                 int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuVector>(m->vec);
-  if (v != nullptr) {
-    *view_data = v->getData();
-    *dim1 = (int)v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new float[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void Vector::copyFromNumpyArray(float* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-FloatArray Vector::getData() const {
-  if (this->isGpu()) {
-    float* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    float* dest = new float[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(float));
-    FloatArray ret_val(dest, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(m->vec->getData(), m->vec->getSize());
-    return ret_val;
-  }
-}
-
-void Vector::copyFrom(Vector* src) throw(RangeError) {
-  if (src->m->vec->getSize() != m->vec->getSize()) {
-    throw RangeError();
-  }
-  m->vec->copyFrom(*src->m->vec);
-}
-
-bool Vector::isGpu() const {
-  return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
-}
-
-float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) {
-  float r;
-  m->safeAccessData(idx, [&](float& o) { r = o; });
-  return r;
-}
-
-void Vector::set(const size_t idx, float val) throw(RangeError,
-                                                    UnsupportError) {
-  m->safeAccessData(idx, [&](float& o) { o = val; });
-}
-
-size_t Vector::getSize() const { return m->vec->getSize(); }
-
-void* Vector::getSharedPtr() { return &m->vec; }
diff --git a/paddle/legacy/api/__init__.py b/paddle/legacy/api/__init__.py
deleted file mode 100644
index f662d6826..000000000
--- a/paddle/legacy/api/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/api/numpy.i b/paddle/legacy/api/numpy.i
deleted file mode 100644
index 2ddc11de7..000000000
--- a/paddle/legacy/api/numpy.i
+++ /dev/null
@@ -1,3161 +0,0 @@
-/* -*- C -*-  (not really, but good for syntax highlighting) */
-
-/*
- * Copyright (c) 2005-2015, NumPy Developers.
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- *     * Redistributions of source code must retain the above copyright
- *        notice, this list of conditions and the following disclaimer.
- *
- *     * Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials provided
- *        with the distribution.
- *
- *     * Neither the name of the NumPy Developers nor the names of any
- *        contributors may be used to endorse or promote products derived
- *        from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifdef SWIGPYTHON
-
-%{
-#ifndef SWIG_FILE_WITH_INIT
-#define NO_IMPORT_ARRAY
-#endif
-#include "stdio.h"
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/arrayobject.h>
-%}
-
-/**********************************************************************/
-
-%fragment("NumPy_Backward_Compatibility", "header")
-{
-%#if NPY_API_VERSION < 0x00000007
-%#define NPY_ARRAY_DEFAULT NPY_DEFAULT
-%#define NPY_ARRAY_FARRAY  NPY_FARRAY
-%#define NPY_FORTRANORDER  NPY_FORTRAN
-%#endif
-}
-
-/**********************************************************************/
-
-/* The following code originally appeared in
- * enthought/kiva/agg/src/numeric.i written by Eric Jones.  It was
- * translated from C++ to C by John Hunter.  Bill Spotz has modified
- * it to fix some minor bugs, upgrade from Numeric to numpy (all
- * versions), add some comments and functionality, and convert from
- * direct code insertion to SWIG fragments.
- */
-
-%fragment("NumPy_Macros", "header")
-{
-/* Macros to extract array attributes.
- */
-%#if NPY_API_VERSION < 0x00000007
-%#define is_array(a)            ((a) && PyArray_Check((PyArrayObject*)a))
-%#define array_type(a)          (int)(PyArray_TYPE((PyArrayObject*)a))
-%#define array_numdims(a)       (((PyArrayObject*)a)->nd)
-%#define array_dimensions(a)    (((PyArrayObject*)a)->dimensions)
-%#define array_size(a,i)        (((PyArrayObject*)a)->dimensions[i])
-%#define array_strides(a)       (((PyArrayObject*)a)->strides)
-%#define array_stride(a,i)      (((PyArrayObject*)a)->strides[i])
-%#define array_data(a)          (((PyArrayObject*)a)->data)
-%#define array_descr(a)         (((PyArrayObject*)a)->descr)
-%#define array_flags(a)         (((PyArrayObject*)a)->flags)
-%#define array_enableflags(a,f) (((PyArrayObject*)a)->flags) = f
-%#else
-%#define is_array(a)            ((a) && PyArray_Check(a))
-%#define array_type(a)          PyArray_TYPE((PyArrayObject*)a)
-%#define array_numdims(a)       PyArray_NDIM((PyArrayObject*)a)
-%#define array_dimensions(a)    PyArray_DIMS((PyArrayObject*)a)
-%#define array_strides(a)       PyArray_STRIDES((PyArrayObject*)a)
-%#define array_stride(a,i)      PyArray_STRIDE((PyArrayObject*)a,i)
-%#define array_size(a,i)        PyArray_DIM((PyArrayObject*)a,i)
-%#define array_data(a)          PyArray_DATA((PyArrayObject*)a)
-%#define array_descr(a)         PyArray_DESCR((PyArrayObject*)a)
-%#define array_flags(a)         PyArray_FLAGS((PyArrayObject*)a)
-%#define array_enableflags(a,f) PyArray_ENABLEFLAGS((PyArrayObject*)a,f)
-%#endif
-%#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject*)a))
-%#define array_is_native(a)     (PyArray_ISNOTSWAPPED((PyArrayObject*)a))
-%#define array_is_fortran(a)    (PyArray_ISFORTRAN((PyArrayObject*)a))
-}
-
-/**********************************************************************/
-
-%fragment("NumPy_Utilities",
-          "header")
-{
-  /* Given a PyObject, return a string describing its type.
-   */
-  const char* pytype_string(PyObject* py_obj)
-  {
-    if (py_obj == NULL          ) return "C NULL value";
-    if (py_obj == Py_None       ) return "Python None" ;
-    if (PyCallable_Check(py_obj)) return "callable"    ;
-    if (PyString_Check(  py_obj)) return "string"      ;
-    if (PyInt_Check(     py_obj)) return "int"         ;
-    if (PyFloat_Check(   py_obj)) return "float"       ;
-    if (PyDict_Check(    py_obj)) return "dict"        ;
-    if (PyList_Check(    py_obj)) return "list"        ;
-    if (PyTuple_Check(   py_obj)) return "tuple"       ;
-%#if PY_MAJOR_VERSION < 3
-    if (PyFile_Check(    py_obj)) return "file"        ;
-    if (PyModule_Check(  py_obj)) return "module"      ;
-    if (PyInstance_Check(py_obj)) return "instance"    ;
-%#endif
-
-    return "unknown type";
-  }
-
-  /* Given a NumPy typecode, return a string describing the type.
-   */
-  const char* typecode_string(int typecode)
-  {
-    static const char* type_names[25] = {"bool",
-                                         "byte",
-                                         "unsigned byte",
-                                         "short",
-                                         "unsigned short",
-                                         "int",
-                                         "unsigned int",
-                                         "long",
-                                         "unsigned long",
-                                         "long long",
-                                         "unsigned long long",
-                                         "float",
-                                         "double",
-                                         "long double",
-                                         "complex float",
-                                         "complex double",
-                                         "complex long double",
-                                         "object",
-                                         "string",
-                                         "unicode",
-                                         "void",
-                                         "ntypes",
-                                         "notype",
-                                         "char",
-                                         "unknown"};
-    return typecode < 24 ? type_names[typecode] : type_names[24];
-  }
-
-  /* Make sure input has correct numpy type.  This now just calls
-     PyArray_EquivTypenums().
-   */
-  int type_match(int actual_type,
-                 int desired_type)
-  {
-    return PyArray_EquivTypenums(actual_type, desired_type);
-  }
-
-%#ifdef SWIGPY_USE_CAPSULE
-  void free_cap(PyObject * cap)
-  {
-    void* array = (void*) PyCapsule_GetPointer(cap,SWIGPY_CAPSULE_NAME);
-    if (array != NULL) free(array);
-  }
-%#endif
-
-
-}
-
-/**********************************************************************/
-
-%fragment("NumPy_Object_to_Array",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros",
-          fragment="NumPy_Utilities")
-{
-  /* Given a PyObject pointer, cast it to a PyArrayObject pointer if
-   * legal.  If not, set the python error string appropriately and
-   * return NULL.
-   */
-  PyArrayObject* obj_to_array_no_conversion(PyObject* input,
-                                            int        typecode)
-  {
-    PyArrayObject* ary = NULL;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input), typecode)))
-    {
-      ary = (PyArrayObject*) input;
-    }
-    else if is_array(input)
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = typecode_string(array_type(input));
-      PyErr_Format(PyExc_TypeError,
-                   "Array of type '%s' required.  Array of type '%s' given",
-                   desired_type, actual_type);
-      ary = NULL;
-    }
-    else
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = pytype_string(input);
-      PyErr_Format(PyExc_TypeError,
-                   "Array of type '%s' required.  A '%s' was given",
-                   desired_type,
-                   actual_type);
-      ary = NULL;
-    }
-    return ary;
-  }
-
-  /* Convert the given PyObject to a NumPy array with the given
-   * typecode.  On success, return a valid PyArrayObject* with the
-   * correct type.  On failure, the python error string will be set and
-   * the routine returns NULL.
-   */
-  PyArrayObject* obj_to_array_allow_conversion(PyObject* input,
-                                               int       typecode,
-                                               int*      is_new_object)
-  {
-    PyArrayObject* ary = NULL;
-    PyObject*      py_obj;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input),typecode)))
-    {
-      ary = (PyArrayObject*) input;
-      *is_new_object = 0;
-    }
-    else
-    {
-      py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT);
-      /* If NULL, PyArray_FromObject will have set python error value.*/
-      ary = (PyArrayObject*) py_obj;
-      *is_new_object = 1;
-    }
-    return ary;
-  }
-
-  /* Given a PyArrayObject, check to see if it is contiguous.  If so,
-   * return the input pointer and flag it as not a new object.  If it is
-   * not contiguous, create a new PyArrayObject using the original data,
-   * flag it as a new object and return the pointer.
-   */
-  PyArrayObject* make_contiguous(PyArrayObject* ary,
-                                 int*           is_new_object,
-                                 int            min_dims,
-                                 int            max_dims)
-  {
-    PyArrayObject* result;
-    if (array_is_contiguous(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      result = (PyArrayObject*) PyArray_ContiguousFromObject((PyObject*)ary,
-                                                              array_type(ary),
-                                                              min_dims,
-                                                              max_dims);
-      *is_new_object = 1;
-    }
-    return result;
-  }
-
-  /* Given a PyArrayObject, check to see if it is Fortran-contiguous.
-   * If so, return the input pointer, but do not flag it as not a new
-   * object.  If it is not Fortran-contiguous, create a new
-   * PyArrayObject using the original data, flag it as a new object
-   * and return the pointer.
-   */
-  PyArrayObject* make_fortran(PyArrayObject* ary,
-                              int*           is_new_object)
-  {
-    PyArrayObject* result;
-    if (array_is_fortran(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      Py_INCREF(array_descr(ary));
-      result = (PyArrayObject*) PyArray_FromArray(ary,
-                                                  array_descr(ary),
-                                                  NPY_FORTRANORDER);
-      *is_new_object = 1;
-    }
-    return result;
-  }
-
-  /* Convert a given PyObject to a contiguous PyArrayObject of the
-   * specified type.  If the input object is not a contiguous
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_contiguous_allow_conversion(PyObject* input,
-                                                          int       typecode,
-                                                          int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_contiguous(ary1, &is_new2, 0, 0);
-      if ( is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
-    }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
-  }
-
-  /* Convert a given PyObject to a Fortran-ordered PyArrayObject of the
-   * specified type.  If the input object is not a Fortran-ordered
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_fortran_allow_conversion(PyObject* input,
-                                                       int       typecode,
-                                                       int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_fortran(ary1, &is_new2);
-      if (is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
-    }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
-  }
-} /* end fragment */
-
-/**********************************************************************/
-
-%fragment("NumPy_Array_Requirements",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros")
-{
-  /* Test whether a python object is contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!array_is_contiguous(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous.  A non-contiguous array was given");
-      contiguous = 0;
-    }
-    return contiguous;
-  }
-
-  /* Test whether a python object is (C_ or F_) contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_c_or_f_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!(array_is_contiguous(ary) || array_is_fortran(ary)))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous (C_ or F_).  A non-contiguous array was given");
-      contiguous = 0;
-    }
-    return contiguous;
-  }
-
-  /* Require that a numpy array is not byte-swapped.  If the array is
-   * not byte-swapped, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_native(PyArrayObject* ary)
-  {
-    int native = 1;
-    if (!array_is_native(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must have native byteorder.  "
-                      "A byte-swapped array was given");
-      native = 0;
-    }
-    return native;
-  }
-
-  /* Require the given PyArrayObject to have a specified number of
-   * dimensions.  If the array has the specified number of dimensions,
-   * return 1.  Otherwise, set the python error string and return 0.
-   */
-  int require_dimensions(PyArrayObject* ary,
-                         int            exact_dimensions)
-  {
-    int success = 1;
-    if (array_numdims(ary) != exact_dimensions)
-    {
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %d dimensions.  Given array has %d dimensions",
-                   exact_dimensions,
-                   array_numdims(ary));
-      success = 0;
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to have one of a list of specified
-   * number of dimensions.  If the array has one of the specified number
-   * of dimensions, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_dimensions_n(PyArrayObject* ary,
-                           int*           exact_dimensions,
-                           int            n)
-  {
-    int success = 0;
-    int i;
-    char dims_str[255] = "";
-    char s[255];
-    for (i = 0; i < n && !success; i++)
-    {
-      if (array_numdims(ary) == exact_dimensions[i])
-      {
-        success = 1;
-      }
-    }
-    if (!success)
-    {
-      for (i = 0; i < n-1; i++)
-      {
-        sprintf(s, "%d, ", exact_dimensions[i]);
-        strcat(dims_str,s);
-      }
-      sprintf(s, " or %d", exact_dimensions[n-1]);
-      strcat(dims_str,s);
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %s dimensions.  Given array has %d dimensions",
-                   dims_str,
-                   array_numdims(ary));
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to have a specified shape.  If the
-   * array has the specified shape, return 1.  Otherwise, set the python
-   * error string and return 0.
-   */
-  int require_size(PyArrayObject* ary,
-                   npy_intp*      size,
-                   int            n)
-  {
-    int i;
-    int success = 1;
-    int len;
-    char desired_dims[255] = "[";
-    char s[255];
-    char actual_dims[255] = "[";
-    for(i=0; i < n;i++)
-    {
-      if (size[i] != -1 &&  size[i] != array_size(ary,i))
-      {
-        success = 0;
-      }
-    }
-    if (!success)
-    {
-      for (i = 0; i < n; i++)
-      {
-        if (size[i] == -1)
-        {
-          sprintf(s, "*,");
-        }
-        else
-        {
-          sprintf(s, "%ld,", (long int)size[i]);
-        }
-        strcat(desired_dims,s);
-      }
-      len = strlen(desired_dims);
-      desired_dims[len-1] = ']';
-      for (i = 0; i < n; i++)
-      {
-        sprintf(s, "%ld,", (long int)array_size(ary,i));
-        strcat(actual_dims,s);
-      }
-      len = strlen(actual_dims);
-      actual_dims[len-1] = ']';
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have shape of %s.  Given array has shape of %s",
-                   desired_dims,
-                   actual_dims);
-    }
-    return success;
-  }
-
-  /* Require the given PyArrayObject to to be Fortran ordered.  If the
-   * the PyArrayObject is already Fortran ordered, do nothing.  Else,
-   * set the Fortran ordering flag and recompute the strides.
-   */
-  int require_fortran(PyArrayObject* ary)
-  {
-    int success = 1;
-    int nd = array_numdims(ary);
-    int i;
-    npy_intp * strides = array_strides(ary);
-    if (array_is_fortran(ary)) return success;
-    /* Set the Fortran ordered flag */
-    array_enableflags(ary,NPY_ARRAY_FARRAY);
-    /* Recompute the strides */
-    strides[0] = strides[nd-1];
-    for (i=1; i < nd; ++i)
-      strides[i] = strides[i-1] * array_size(ary,i-1);
-    return success;
-  }
-}
-
-/* Combine all NumPy fragments into one for convenience */
-%fragment("NumPy_Fragments",
-          "header",
-          fragment="NumPy_Backward_Compatibility",
-          fragment="NumPy_Macros",
-          fragment="NumPy_Utilities",
-          fragment="NumPy_Object_to_Array",
-          fragment="NumPy_Array_Requirements")
-{
-}
-
-/* End John Hunter translation (with modifications by Bill Spotz)
- */
-
-/* %numpy_typemaps() macro
- *
- * This macro defines a family of 75 typemaps that allow C arguments
- * of the form
- *
- *    1. (DATA_TYPE IN_ARRAY1[ANY])
- *    2. (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
- *    3. (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
- *
- *    4. (DATA_TYPE IN_ARRAY2[ANY][ANY])
- *    5. (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *    6. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
- *    7. (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *    8. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
- *
- *    9. (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
- *   10. (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   11. (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   12. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
- *   13. (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   14. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
- *
- *   15. (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
- *   16. (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   17. (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   18. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, , DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
- *   19. (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   20. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
- *
- *   21. (DATA_TYPE INPLACE_ARRAY1[ANY])
- *   22. (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
- *   23. (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
- *
- *   24. (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
- *   25. (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *   26. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
- *   27. (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *   28. (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
- *
- *   29. (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
- *   30. (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   31. (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   32. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
- *   33. (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
- *   34. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
- *
- *   35. (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
- *   36. (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   37. (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   38. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
- *   39. (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
- *   40. (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
- *
- *   41. (DATA_TYPE ARGOUT_ARRAY1[ANY])
- *   42. (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
- *   43. (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
- *
- *   44. (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
- *
- *   45. (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
- *
- *   46. (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
- *
- *   47. (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
- *   48. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
- *
- *   49. (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   50. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
- *   51. (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   52. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
- *
- *   53. (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   54. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
- *   55. (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   56. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
- *
- *   57. (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   58. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
- *   59. (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   60. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
- *
- *   61. (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
- *   62. (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
- *
- *   63. (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   64. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
- *   65. (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- *   66. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
- *
- *   67. (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   68. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
- *   69. (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
- *   70. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
- *
- *   71. (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   72. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- *   73. (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- *   74. (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- *
- *   75. (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
- *
- * where "DATA_TYPE" is any type supported by the NumPy module, and
- * "DIM_TYPE" is any int-like type suitable for specifying dimensions.
- * The difference between "ARRAY" typemaps and "FARRAY" typemaps is
- * that the "FARRAY" typemaps expect Fortran ordering of
- * multidimensional arrays.  In Python, the dimensions will not need
- * to be specified (except for the "DATA_TYPE* ARGOUT_ARRAY1"
- * typemaps).  The IN_ARRAYs can be a numpy array or any sequence that
- * can be converted to a numpy array of the specified type.  The
- * INPLACE_ARRAYs must be numpy arrays of the appropriate type.  The
- * ARGOUT_ARRAYs will be returned as new numpy arrays of the
- * appropriate type.
- *
- * These typemaps can be applied to existing functions using the
- * %apply directive.  For example:
- *
- *     %apply (double* IN_ARRAY1, int DIM1) {(double* series, int length)};
- *     double prod(double* series, int length);
- *
- *     %apply (int DIM1, int DIM2, double* INPLACE_ARRAY2)
- *           {(int rows, int cols, double* matrix        )};
- *     void floor(int rows, int cols, double* matrix, double f);
- *
- *     %apply (double IN_ARRAY3[ANY][ANY][ANY])
- *           {(double tensor[2][2][2]         )};
- *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
- *           {(double low[2][2][2]                )};
- *     %apply (double ARGOUT_ARRAY3[ANY][ANY][ANY])
- *           {(double upp[2][2][2]                )};
- *     void luSplit(double tensor[2][2][2],
- *                  double low[2][2][2],
- *                  double upp[2][2][2]    );
- *
- * or directly with
- *
- *     double prod(double* IN_ARRAY1, int DIM1);
- *
- *     void floor(int DIM1, int DIM2, double* INPLACE_ARRAY2, double f);
- *
- *     void luSplit(double IN_ARRAY3[ANY][ANY][ANY],
- *                  double ARGOUT_ARRAY3[ANY][ANY][ANY],
- *                  double ARGOUT_ARRAY3[ANY][ANY][ANY]);
- */
-
-%define %numpy_typemaps(DATA_TYPE, DATA_TYPECODE, DIM_TYPE)
-
-/************************/
-/* Input Array Typemaps */
-/************************/
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY1[ANY])
- *
- * Fixed-length one-dimensional input.  The Python argument may be a NumPy
- * array or any sequence; the expected length is taken from the declared
- * C array dimension.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY1[ANY])
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY1[ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[1] = { $1_dim0 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 1)) SWIG_fail;
-  if (!require_size(array, shape, 1)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY1[ANY])
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
- *
- * One-dimensional input handed to C as a data pointer followed by its
- * length.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[1] = { -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 1)) SWIG_fail;
-  if (!require_size(array, shape, 1)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY1, DIM_TYPE DIM1)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
- *
- * One-dimensional input handed to C as its length followed by a data
- * pointer.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[1] = { -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 1)) SWIG_fail;
-  if (!require_size(array, shape, 1)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DATA_TYPE* IN_ARRAY1)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY2[ANY][ANY])
- *
- * Fixed-shape two-dimensional input.  The Python argument may be a NumPy
- * array or any sequence; the expected shape is taken from the declared
- * C array dimensions.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[2] = { $1_dim0, $1_dim1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 2)) SWIG_fail;
-  if (!require_size(array, shape, 2)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY2[ANY][ANY])
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *
- * Two-dimensional input handed to C as a data pointer followed by both
- * extents.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[2] = { -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 2)) SWIG_fail;
-  if (!require_size(array, shape, 2)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
- *
- * Two-dimensional input handed to C as both extents followed by a data
- * pointer.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[2] = { -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 2)) SWIG_fail;
-  if (!require_size(array, shape, 2)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_ARRAY2)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- *
- * Fortran-ordered two-dimensional input handed to C as a data pointer
- * followed by both extents.  The Python argument may be a NumPy array or
- * any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[2] = { -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 2)) SWIG_fail;
-  if (!require_size(array, shape, 2)) SWIG_fail;
-  if (!require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
- *
- * Fortran-ordered two-dimensional input handed to C as both extents
- * followed by a data pointer.  The Python argument may be a NumPy array
- * or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[2] = { -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 2)) SWIG_fail;
-  if (!require_size(array, shape, 2)) SWIG_fail;
-  if (!require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* IN_FARRAY2)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
- *
- * Fixed-shape three-dimensional input.  The Python argument may be a
- * NumPy array or any sequence; the expected shape is taken from the
- * declared C array dimensions.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = obj_to_array_contiguous_allow_conversion($input, DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 3)) SWIG_fail;
-  if (!require_size(array, shape, 3)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY3[ANY][ANY][ANY])
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- *
- * Three-dimensional input handed to C as a data pointer followed by the
- * three extents.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[3] = { -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 3)) SWIG_fail;
-  if (!require_size(array, shape, 3)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- *
- * The Python argument is a sequence whose elements are each converted to
- * a two-dimensional array; every element must have the same shape as the
- * first one.  C receives one data pointer per element, with DIM1 set to
- * the sequence length and DIM2/DIM3 set to the common element shape.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  /* any sequence is accepted here; its elements are validated later */
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
-{
-  npy_intp size[2] = { -1, -1 };
-  PyArrayObject* temp_array;
-  PyObject* item;
-  Py_ssize_t i;
-  Py_ssize_t count;
-  Py_ssize_t slots;
-  int is_new_object;
-
-  /* Length of the sequence.  The typecheck admits every sequence type, so
-   * the generic sequence API is used instead of the list-only one. */
-  count = PySequence_Size($input);
-  if (count < 0) SWIG_fail;
-  $2 = (DIM_TYPE) count;
-
-  /* Always request at least one slot: a zero-byte allocation may return
-   * NULL, which would be indistinguishable from running out of memory. */
-  slots = count ? count : 1;
-  array = (DATA_TYPE **)malloc(slots*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc(slots,sizeof(PyArrayObject *));
-  is_new_object_array = (int *)calloc(slots,sizeof(int));
-
-  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
-  {
-    /* report the failure to Python instead of failing without an exception */
-    PyErr_NoMemory();
-    SWIG_fail;
-  }
-
-  for (i=0; i<count; i++)
-  {
-    /* PySequence_GetItem hands back a new reference that must be released */
-    item = PySequence_GetItem($input,i);
-    if (item == NULL) SWIG_fail;
-
-    temp_array = obj_to_array_contiguous_allow_conversion(item, DATA_TYPECODE, &is_new_object);
-
-    if (temp_array != NULL && (PyObject*)temp_array == item)
-    {
-      /* The element itself is used: keep the reference obtained above alive
-       * for the duration of the call and have freearg release it. */
-      is_new_object = 1;
-    }
-    else
-    {
-      /* Conversion failed or produced a separate object, so the element
-       * reference is no longer needed. */
-      Py_DECREF(item);
-    }
-
-    /* the converted object must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-    is_new_object_array[i] = is_new_object;
-
-    if (!temp_array || !require_dimensions(temp_array, 2)) SWIG_fail;
-
-    /* store the size of the first element, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-    }
-
-    if (!require_size(temp_array, size, 2)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-}
-%typemap(freearg)
-  (DATA_TYPE** IN_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  Py_ssize_t i;
-
-  /* free(NULL) is a no-op, so each buffer is released unconditionally */
-  free(array$argnum);
-
-  /* release the per-element references recorded by the "in" typemap;
-   * both tables are needed, and both stay NULL if allocation never ran */
-  if (object_array$argnum!=NULL && is_new_object_array$argnum!=NULL)
-  {
-    for (i=0; i<$2; i++)
-    {
-      if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
-      { Py_DECREF(object_array$argnum[i]); }
-    }
-  }
-  free(is_new_object_array$argnum);
-  free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* IN_ARRAY3)
- *
- * Three-dimensional input handed to C as the three extents followed by a
- * data pointer.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[3] = { -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 3)) SWIG_fail;
-  if (!require_size(array, shape, 3)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_ARRAY3)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- *
- * Fortran-ordered three-dimensional input handed to C as a data pointer
- * followed by the three extents.  The Python argument may be a NumPy
- * array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  /* Logical OR throughout, so the checks short-circuit: once one of them
-   * fails, no later check runs. */
-  if (!array || !require_dimensions(array, 3) ||
-      !require_size(array, size, 3) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  /* release the converted object only when conversion created one */
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* IN_FARRAY3)
- *
- * Fortran-ordered three-dimensional input handed to C as the three
- * extents followed by a data pointer.  The Python argument may be a NumPy
- * array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[3] = { -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 3)) SWIG_fail;
-  if (!require_size(array, shape, 3)) SWIG_fail;
-  if (!require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* IN_FARRAY3)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
- *
- * Fixed-shape four-dimensional input.  The Python argument may be a NumPy
- * array or any sequence; the expected shape is taken from the declared
- * C array dimensions.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[4] = { $1_dim0, $1_dim1, $1_dim2, $1_dim3 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 4)) SWIG_fail;
-  if (!require_size(array, shape, 4)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(freearg)
-  (DATA_TYPE IN_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- *
- * Four-dimensional input handed to C as a data pointer followed by the
- * four extents.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 4)) SWIG_fail;
-  if (!require_size(array, shape, 4)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- *
- * The Python argument is a sequence whose elements are each converted to
- * a three-dimensional array; every element must have the same shape as
- * the first one.  C receives one data pointer per element, with DIM1 set
- * to the sequence length and DIM2..DIM4 set to the common element shape.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  /* any sequence is accepted here; its elements are validated later */
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL, int* is_new_object_array=NULL)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  PyArrayObject* temp_array;
-  PyObject* item;
-  Py_ssize_t i;
-  Py_ssize_t count;
-  Py_ssize_t slots;
-  int is_new_object;
-
-  /* Length of the sequence.  The typecheck admits every sequence type, so
-   * the generic sequence API is used instead of the list-only one. */
-  count = PySequence_Size($input);
-  if (count < 0) SWIG_fail;
-  $2 = (DIM_TYPE) count;
-
-  /* Always request at least one slot: a zero-byte allocation may return
-   * NULL, which would be indistinguishable from running out of memory. */
-  slots = count ? count : 1;
-  array = (DATA_TYPE **)malloc(slots*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc(slots,sizeof(PyArrayObject *));
-  is_new_object_array = (int *)calloc(slots,sizeof(int));
-
-  if (array == NULL || object_array == NULL || is_new_object_array == NULL)
-  {
-    /* report the failure to Python instead of failing without an exception */
-    PyErr_NoMemory();
-    SWIG_fail;
-  }
-
-  for (i=0; i<count; i++)
-  {
-    /* PySequence_GetItem hands back a new reference that must be released */
-    item = PySequence_GetItem($input,i);
-    if (item == NULL) SWIG_fail;
-
-    temp_array = obj_to_array_contiguous_allow_conversion(item, DATA_TYPECODE, &is_new_object);
-
-    if (temp_array != NULL && (PyObject*)temp_array == item)
-    {
-      /* The element itself is used: keep the reference obtained above alive
-       * for the duration of the call and have freearg release it. */
-      is_new_object = 1;
-    }
-    else
-    {
-      /* Conversion failed or produced a separate object, so the element
-       * reference is no longer needed. */
-      Py_DECREF(item);
-    }
-
-    /* the converted object must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-    is_new_object_array[i] = is_new_object;
-
-    if (!temp_array || !require_dimensions(temp_array, 3)) SWIG_fail;
-
-    /* store the size of the first element, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-      size[2] = array_size(temp_array,2);
-    }
-
-    if (!require_size(temp_array, size, 3)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-  $5 = (DIM_TYPE) size[2];
-}
-%typemap(freearg)
-  (DATA_TYPE** IN_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  Py_ssize_t i;
-
-  /* free(NULL) is a no-op, so each buffer is released unconditionally */
-  free(array$argnum);
-
-  /* release the per-element references recorded by the "in" typemap;
-   * both tables are needed, and both stay NULL if allocation never ran */
-  if (object_array$argnum!=NULL && is_new_object_array$argnum!=NULL)
-  {
-    for (i=0; i<$2; i++)
-    {
-      if (object_array$argnum[i] != NULL && is_new_object_array$argnum[i])
-      { Py_DECREF(object_array$argnum[i]); }
-    }
-  }
-  free(is_new_object_array$argnum);
-  free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* IN_ARRAY4)
- *
- * Four-dimensional input handed to C as the four extents followed by a
- * data pointer.  The Python argument may be a NumPy array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_contiguous_allow_conversion($input,
-                                                   DATA_TYPECODE,
-                                                   &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 4)) SWIG_fail;
-  if (!require_size(array, shape, 4)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_ARRAY4)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/* Typemap suite for (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- *
- * Fortran-ordered four-dimensional input handed to C as a data pointer
- * followed by the four extents.  The Python argument may be a NumPy
- * array or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) || PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp size[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input, DATA_TYPECODE,
-                                                &is_new_object);
-  /* Logical OR throughout, so the checks short-circuit: once one of them
-   * fails, no later check runs. */
-  if (!array || !require_dimensions(array, 4) ||
-      !require_size(array, size, 4) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-%typemap(freearg)
-  (DATA_TYPE* IN_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  /* release the converted object only when conversion created one */
-  if (is_new_object$argnum && array$argnum)
-    { Py_DECREF(array$argnum); }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* IN_FARRAY4)
- *
- * Fortran-ordered four-dimensional input handed to C as the four extents
- * followed by a data pointer.  The Python argument may be a NumPy array
- * or any sequence.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-{
-  $1 = (is_array($input) || PySequence_Check($input)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-  (PyArrayObject* array=NULL, int is_new_object=0)
-{
-  npy_intp shape[4] = { -1, -1, -1, -1 };
-  array = obj_to_array_fortran_allow_conversion($input,
-                                                DATA_TYPECODE,
-                                                &is_new_object);
-  /* A failed conversion or a failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array, 4)) SWIG_fail;
-  if (!require_size(array, shape, 4)) SWIG_fail;
-  if (!require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-%typemap(freearg)
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* IN_FARRAY4)
-{
-  /* Release the converted object only when conversion created one. */
-  if (is_new_object$argnum)
-  {
-    if (array$argnum) { Py_DECREF(array$argnum); }
-  }
-}
-
-/***************************/
-/* In-Place Array Typemaps */
-/***************************/
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY1[ANY])
- *
- * Fixed-length one-dimensional in-place argument.  The Python argument
- * must already be a NumPy array of the matching type; it is used without
- * conversion, so there is no freearg typemap.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY1[ANY])
-{
-  $1 = (is_array($input) &&
-        PyArray_EquivTypenums(array_type($input), DATA_TYPECODE)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY1[ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp shape[1] = { $1_dim0 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  /* Each failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array,1)) SWIG_fail;
-  if (!require_size(array, shape, 1)) SWIG_fail;
-  if (!require_contiguous(array)) SWIG_fail;
-  if (!require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
- *
- * One-dimensional in-place argument handed to C as a data pointer
- * followed by its element count.  The Python argument must already be a
- * NumPy array of the matching type; it is used without conversion.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
-{
-  $1 = (is_array($input) &&
-        PyArray_EquivTypenums(array_type($input), DATA_TYPECODE)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY1, DIM_TYPE DIM1)
-  (PyArrayObject* array=NULL, int i=1)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  /* Each failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array,1)) SWIG_fail;
-  if (!require_contiguous(array)) SWIG_fail;
-  if (!require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  /* The count is the product of the extents over every dimension. */
-  $2 = 1;
-  for (i=0; i < array_numdims(array); ++i)
-  {
-    $2 *= array_size(array,i);
-  }
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
- *
- * One-dimensional in-place argument handed to C as its element count
- * followed by a data pointer.  The Python argument must already be a
- * NumPy array of the matching type; it is used without conversion.
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY, fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
-{
-  $1 = (is_array($input) &&
-        PyArray_EquivTypenums(array_type($input), DATA_TYPECODE)) ? 1 : 0;
-}
-%typemap(in, fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* INPLACE_ARRAY1)
-  (PyArrayObject* array=NULL, int i=0)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  /* Each failed check abandons the call. */
-  if (!array) SWIG_fail;
-  if (!require_dimensions(array,1)) SWIG_fail;
-  if (!require_contiguous(array)) SWIG_fail;
-  if (!require_native(array)) SWIG_fail;
-  /* The count is the product of the extents over every dimension. */
-  $1 = 1;
-  for (i=0; i < array_numdims(array); ++i)
-  {
-    $1 *= array_size(array,i);
-  }
-  $2 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY2[ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[2] = { $1_dim0, $1_dim1 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_size(array, size, 2) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_ARRAY2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY2, DIM_TYPE DIM1, DIM_TYPE DIM2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DATA_TYPE* INPLACE_FARRAY2)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,2) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY3[ANY][ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_size(array, size, 3) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-
-/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
-{
-  npy_intp size[2] = { -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-
-  if (array == NULL || object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-
-    if ( !temp_array || !require_dimensions(temp_array, 2) ||
-      !require_contiguous(temp_array) ||
-      !require_native(temp_array) ||
-      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
-    ) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-    }
-
-    if (!require_size(temp_array, size, 2)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-}
-%typemap(freearg)
-  (DATA_TYPE** INPLACE_ARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  if (array$argnum!=NULL) free(array$argnum);
-  if (object_array$argnum!=NULL) free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_ARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_ARRAY3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY3, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_FARRAY3)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DATA_TYPE* INPLACE_FARRAY3)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,3) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE INPLACE_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyArrayObject* array=NULL)
-{
-  npy_intp size[4] = { $1_dim0, $1_dim1, $1_dim2 , $1_dim3 };
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_size(array, size, 4) ||
-      !require_contiguous(array) || !require_native(array)) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
-      !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-
-/* Typemap suite for (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = PySequence_Check($input);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (DATA_TYPE** array=NULL, PyArrayObject** object_array=NULL)
-{
-  npy_intp size[3] = { -1, -1, -1 };
-  PyArrayObject* temp_array;
-  Py_ssize_t i;
-
-  /* length of the list */
-  $2 = PyList_Size($input);
-
-  /* the arrays */
-  array = (DATA_TYPE **)malloc($2*sizeof(DATA_TYPE *));
-  object_array = (PyArrayObject **)calloc($2,sizeof(PyArrayObject *));
-
-  if (array == NULL || object_array == NULL)
-  {
-    SWIG_fail;
-  }
-
-  for (i=0; i<$2; i++)
-  {
-    temp_array = obj_to_array_no_conversion(PySequence_GetItem($input,i), DATA_TYPECODE);
-
-    /* the new array must be stored so that it can be destroyed in freearg */
-    object_array[i] = temp_array;
-
-    if ( !temp_array || !require_dimensions(temp_array, 3) ||
-      !require_contiguous(temp_array) ||
-      !require_native(temp_array) ||
-      !PyArray_EquivTypenums(array_type(temp_array), DATA_TYPECODE)
-    ) SWIG_fail;
-
-    /* store the size of the first array in the list, then use that for comparison. */
-    if (i == 0)
-    {
-      size[0] = array_size(temp_array,0);
-      size[1] = array_size(temp_array,1);
-      size[2] = array_size(temp_array,2);
-    }
-
-    if (!require_size(temp_array, size, 3)) SWIG_fail;
-
-    array[i] = (DATA_TYPE*) array_data(temp_array);
-  }
-
-  $1 = (DATA_TYPE**) array;
-  $3 = (DIM_TYPE) size[0];
-  $4 = (DIM_TYPE) size[1];
-  $5 = (DIM_TYPE) size[2];
-}
-%typemap(freearg)
-  (DATA_TYPE** INPLACE_ARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  if (array$argnum!=NULL) free(array$argnum);
-  if (object_array$argnum!=NULL) free(object_array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4,
- *                    DATA_TYPE* INPLACE_ARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_ARRAY4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-
-/* Typemap suite for (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2,
- *                    DIM_TYPE DIM3, DIM_TYPE DIM4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_FARRAY4, DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array) ||
-      !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = (DIM_TYPE) array_size(array,0);
-  $3 = (DIM_TYPE) array_size(array,1);
-  $4 = (DIM_TYPE) array_size(array,2);
-  $5 = (DIM_TYPE) array_size(array,3);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3,
- *                    DATA_TYPE* INPLACE_FARRAY4)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DIM_TYPE DIM2, DIM_TYPE DIM3, DIM_TYPE DIM4, DATA_TYPE* INPLACE_FARRAY4)
-  (PyArrayObject* array=NULL)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_dimensions(array,4) || !require_contiguous(array)
-      || !require_native(array) || !require_fortran(array)) SWIG_fail;
-  $1 = (DIM_TYPE) array_size(array,0);
-  $2 = (DIM_TYPE) array_size(array,1);
-  $3 = (DIM_TYPE) array_size(array,2);
-  $4 = (DIM_TYPE) array_size(array,3);
-  $5 = (DATA_TYPE*) array_data(array);
-}
-
-/*************************/
-/* Argout Array Typemaps */
-/*************************/
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY1[ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY1[ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1] = { $1_dim0 };
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY1[ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
- */
-%typemap(in,numinputs=1,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1];
-  if (!PyInt_Check($input))
-  {
-    const char* typestring = pytype_string($input);
-    PyErr_Format(PyExc_TypeError,
-                 "Int dimension expected.  '%s' given.",
-                 typestring);
-    SWIG_fail;
-  }
-  $2 = (DIM_TYPE) PyInt_AsLong($input);
-  dims[0] = (npy_intp) $2;
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE* ARGOUT_ARRAY1, DIM_TYPE DIM1)
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
- */
-%typemap(in,numinputs=1,
-         fragment="NumPy_Fragments")
-  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
-  (PyObject* array = NULL)
-{
-  npy_intp dims[1];
-  if (!PyInt_Check($input))
-  {
-    const char* typestring = pytype_string($input);
-    PyErr_Format(PyExc_TypeError,
-                 "Int dimension expected.  '%s' given.",
-                 typestring);
-    SWIG_fail;
-  }
-  $1 = (DIM_TYPE) PyInt_AsLong($input);
-  dims[0] = (npy_intp) $1;
-  array = PyArray_SimpleNew(1, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $2 = (DATA_TYPE*) array_data(array);
-}
-%typemap(argout)
-  (DIM_TYPE DIM1, DATA_TYPE* ARGOUT_ARRAY1)
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[2] = { $1_dim0, $1_dim1 };
-  array = PyArray_SimpleNew(2, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY2[ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[3] = { $1_dim0, $1_dim1, $1_dim2 };
-  array = PyArray_SimpleNew(3, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY3[ANY][ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/* Typemap suite for (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
- */
-%typemap(in,numinputs=0,
-         fragment="NumPy_Backward_Compatibility,NumPy_Macros")
-  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
-  (PyObject* array = NULL)
-{
-  npy_intp dims[4] = { $1_dim0, $1_dim1, $1_dim2, $1_dim3 };
-  array = PyArray_SimpleNew(4, dims, DATA_TYPECODE);
-  if (!array) SWIG_fail;
-  $1 = ($1_ltype) array_data(array);
-}
-%typemap(argout)
-  (DATA_TYPE ARGOUT_ARRAY4[ANY][ANY][ANY][ANY])
-{
-  $result = SWIG_Python_AppendOutput($result,(PyObject*)array$argnum);
-}
-
-/*****************************/
-/* Argoutview Array Typemaps */
-/*****************************/
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1    )
-  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY1, DIM_TYPE* DIM1)
-{
-  npy_intp dims[1] = { *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEW_ARRAY1)
-  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL )
-{
-  $1 = &dim_temp;
-  $2 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEW_ARRAY1)
-{
-  npy_intp dims[1] = { *$1 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_ARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_ARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEW_FARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEW_FARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEW_ARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL)
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_ARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEW_FARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEW_FARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEW_FARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL  , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DATA_TYPE** ARGOUTVIEW_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEW_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DATA_TYPE** ARGOUTVIEW_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEW_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEW_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEW_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/*************************************/
-/* Managed Argoutview Array Typemaps */
-/*************************************/
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1    )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY1, DIM_TYPE* DIM1)
-{
-  npy_intp dims[1] = { *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DATA_TYPE** ARGOUTVIEWM_ARRAY1)
-  (DIM_TYPE  dim_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim_temp;
-  $2 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DATA_TYPE** ARGOUTVIEWM_ARRAY1)
-{
-  npy_intp dims[1] = { *$1 };
-  PyObject* obj = PyArray_SimpleNewFromData(1, dims, DATA_TYPECODE, (void*)(*$2));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL  , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_ARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL  )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_ARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1     , DIM_TYPE* DIM2     )
-  (DATA_TYPE*  data_temp = NULL   , DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY2, DIM_TYPE* DIM1, DIM_TYPE* DIM2)
-{
-  npy_intp dims[2] = { *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1     , DIM_TYPE* DIM2     , DATA_TYPE** ARGOUTVIEWM_FARRAY2)
-  (DIM_TYPE  dim1_temp, DIM_TYPE  dim2_temp, DATA_TYPE*  data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DATA_TYPE** ARGOUTVIEWM_FARRAY2)
-{
-  npy_intp dims[2] = { *$1, *$2 };
-  PyObject* obj = PyArray_SimpleNewFromData(2, dims, DATA_TYPECODE, (void*)(*$3));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_ARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_ARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj= PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY3, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[3] = { *$2, *$3, *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY3)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DATA_TYPE** ARGOUTVIEWM_FARRAY3)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DATA_TYPE** ARGOUTVIEWM_FARRAY3)
-{
-  npy_intp dims[3] = { *$1, *$2, *$3 };
-  PyObject* obj = PyArray_SimpleNewFromData(3, dims, DATA_TYPECODE, (void*)(*$4));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL   , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_ARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_ARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL   )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_ARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2,
-                      DIM_TYPE* DIM3, DIM_TYPE* DIM4)
- */
-%typemap(in,numinputs=0)
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    )
-  (DATA_TYPE* data_temp = NULL    , DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp)
-{
-  $1 = &data_temp;
-  $2 = &dim1_temp;
-  $3 = &dim2_temp;
-  $4 = &dim3_temp;
-  $5 = &dim4_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DATA_TYPE** ARGOUTVIEWM_FARRAY4, DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4)
-{
-  npy_intp dims[4] = { *$2, *$3, *$4 , *$5 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$1));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/* Typemap suite for (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4,
-                      DATA_TYPE** ARGOUTVIEWM_FARRAY4)
- */
-%typemap(in,numinputs=0)
-  (DIM_TYPE* DIM1    , DIM_TYPE* DIM2    , DIM_TYPE* DIM3    , DIM_TYPE* DIM4    , DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-  (DIM_TYPE dim1_temp, DIM_TYPE dim2_temp, DIM_TYPE dim3_temp, DIM_TYPE dim4_temp, DATA_TYPE* data_temp = NULL    )
-{
-  $1 = &dim1_temp;
-  $2 = &dim2_temp;
-  $3 = &dim3_temp;
-  $4 = &dim4_temp;
-  $5 = &data_temp;
-}
-%typemap(argout,
-         fragment="NumPy_Backward_Compatibility,NumPy_Array_Requirements,NumPy_Utilities")
-  (DIM_TYPE* DIM1, DIM_TYPE* DIM2, DIM_TYPE* DIM3, DIM_TYPE* DIM4, DATA_TYPE** ARGOUTVIEWM_FARRAY4)
-{
-  npy_intp dims[4] = { *$1, *$2, *$3 , *$4 };
-  PyObject* obj = PyArray_SimpleNewFromData(4, dims, DATA_TYPECODE, (void*)(*$5));
-  PyArrayObject* array = (PyArrayObject*) obj;
-
-  if (!array || !require_fortran(array)) SWIG_fail;
-
-%#ifdef SWIGPY_USE_CAPSULE
-    PyObject* cap = PyCapsule_New((void*)(*$1), SWIGPY_CAPSULE_NAME, free_cap);
-%#else
-    PyObject* cap = PyCObject_FromVoidPtr((void*)(*$1), free);
-%#endif
-
-%#if NPY_API_VERSION < 0x00000007
-  PyArray_BASE(array) = cap;
-%#else
-  PyArray_SetBaseObject(array,cap);
-%#endif
-
-  $result = SWIG_Python_AppendOutput($result,obj);
-}
-
-/**************************************/
-/* In-Place Array Typemap - flattened */
-/**************************************/
-
-/* Typemap suite for (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
- */
-%typecheck(SWIG_TYPECHECK_DOUBLE_ARRAY,
-           fragment="NumPy_Macros")
-  (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
-{
-  $1 = is_array($input) && PyArray_EquivTypenums(array_type($input),
-                                                 DATA_TYPECODE);
-}
-%typemap(in,
-         fragment="NumPy_Fragments")
-  (DATA_TYPE* INPLACE_ARRAY_FLAT, DIM_TYPE DIM_FLAT)
-  (PyArrayObject* array=NULL, int i=1)
-{
-  array = obj_to_array_no_conversion($input, DATA_TYPECODE);
-  if (!array || !require_c_or_f_contiguous(array)
-      || !require_native(array)) SWIG_fail;
-  $1 = (DATA_TYPE*) array_data(array);
-  $2 = 1;
-  for (i=0; i < array_numdims(array); ++i) $2 *= array_size(array,i);
-}
-
-%enddef    /* %numpy_typemaps() macro */
-/* *************************************************************** */
-
-/* Concrete instances of the %numpy_typemaps() macro: Each invocation
- * below applies all of the typemaps above to the specified data type.
- */
-%numpy_typemaps(signed char       , NPY_BYTE     , int)
-%numpy_typemaps(unsigned char     , NPY_UBYTE    , int)
-%numpy_typemaps(short             , NPY_SHORT    , int)
-%numpy_typemaps(unsigned short    , NPY_USHORT   , int)
-%numpy_typemaps(int               , NPY_INT      , int)
-%numpy_typemaps(unsigned int      , NPY_UINT     , int)
-%numpy_typemaps(long              , NPY_LONG     , int)
-%numpy_typemaps(unsigned long     , NPY_ULONG    , int)
-%numpy_typemaps(long long         , NPY_LONGLONG , int)
-%numpy_typemaps(unsigned long long, NPY_ULONGLONG, int)
-%numpy_typemaps(float             , NPY_FLOAT    , int)
-%numpy_typemaps(double            , NPY_DOUBLE   , int)
-
-/* ***************************************************************
- * The follow macro expansion does not work, because C++ bool is 4
- * bytes and NPY_BOOL is 1 byte
- *
- *    %numpy_typemaps(bool, NPY_BOOL, int)
- */
-
-/* ***************************************************************
- * On my Mac, I get the following warning for this macro expansion:
- * 'swig/python detected a memory leak of type 'long double *', no destructor found.'
- *
- *    %numpy_typemaps(long double, NPY_LONGDOUBLE, int)
- */
-
-#ifdef __cplusplus
-
-%include <std_complex.i>
-
-%numpy_typemaps(std::complex<float>,  NPY_CFLOAT , int)
-%numpy_typemaps(std::complex<double>, NPY_CDOUBLE, int)
-
-#endif
-
-#endif /* SWIGPYTHON */
diff --git a/paddle/legacy/api/test/.gitignore b/paddle/legacy/api/test/.gitignore
deleted file mode 100644
index b7948824a..000000000
--- a/paddle/legacy/api/test/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.w0
-*.wbias
diff --git a/paddle/legacy/api/test/CMakeLists.txt b/paddle/legacy/api/test/CMakeLists.txt
deleted file mode 100644
index 13cb79129..000000000
--- a/paddle/legacy/api/test/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
-
-py_test(testTrain SRCS testTrain.py)
-py_test(testMatrix SRCS testMatrix.py)
-py_test(testVector SRCS testVector.py)
-py_test(testTrainer SRCS testTrainer.py)
-py_test(testArguments SRCS testArguments.py)
-py_test(testGradientMachine SRCS testGradientMachine.py)
diff --git a/paddle/legacy/api/test/testArguments.py b/paddle/legacy/api/test/testArguments.py
deleted file mode 100644
index 4d40ffec9..000000000
--- a/paddle/legacy/api/test/testArguments.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import numpy as np
-import unittest
-
-
-class TestArguments(unittest.TestCase):
-    def test_load_arguments(self):
-        m = swig_paddle.Matrix.createDense([4, 2, 4, 3, 9, 5], 2, 3)
-        args = swig_paddle.Arguments.createArguments(1)
-        args.setSlotValue(0, m)
-
-        self.assertAlmostEqual(27.0, args.sum())
-
-        mat = args.getSlotValue(0)
-        assert isinstance(mat, swig_paddle.Matrix)
-        np_mat = mat.toNumpyMatInplace()
-        # The matrix unittest is in testMatrix.py
-        self.assertEqual(np_mat.shape, (2, 3))
-
-        args.setSlotIds(0, swig_paddle.IVector.create([1, 2, 3, 4, 5, 6]))
-        iv = args.getSlotIds(0)
-        assert isinstance(iv, swig_paddle.IVector)
-        np_arr = iv.toNumpyArrayInplace()
-        self.assertEqual(np_arr.shape, (6, ))
-
-    def test_arguments_shape(self):
-        h, w = 4, 6
-        v = np.random.rand(2, h * w)
-        m = swig_paddle.Matrix.createDense(v.flatten(), 2, h * w)
-        args = swig_paddle.Arguments.createArguments(1)
-        args.setSlotValue(0, m)
-        args.setSlotFrameHeight(0, h)
-        args.setSlotFrameWidth(0, w)
-        self.assertEqual(args.getSlotFrameHeight(), h)
-        self.assertEqual(args.getSlotFrameWidth(), w)
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0")
-    unittest.main()
diff --git a/paddle/legacy/api/test/testGradientMachine.py b/paddle/legacy/api/test/testGradientMachine.py
deleted file mode 100644
index 4b705f66e..000000000
--- a/paddle/legacy/api/test/testGradientMachine.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import paddle.proto.ParameterConfig_pb2
-import util
-import unittest
-import numpy
-
-
-class TestGradientMachine(unittest.TestCase):
-    def test_create_gradient_machine(self):
-        conf_file_path = "./testTrainConfig.py"
-        trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
-            conf_file_path)
-        self.assertIsNotNone(trainer_config)
-        opt_config = trainer_config.getOptimizationConfig()
-        model_config = trainer_config.getModelConfig()
-        self.assertIsNotNone(model_config)
-        machine = swig_paddle.GradientMachine.createByModelConfig(
-            model_config, swig_paddle.CREATE_MODE_NORMAL,
-            swig_paddle.ParameterOptimizer.create(opt_config).getParameterTypes(
-            ))
-        self.assertIsNotNone(machine)
-        ipt, _ = util.loadMNISTTrainData()
-        output = swig_paddle.Arguments.createArguments(0)
-
-        optimizers = {}
-
-        # Initial Machine Parameter all to 0.1
-        for param in machine.getParameters():
-            assert isinstance(param, swig_paddle.Parameter)
-            val = param.getBuf(swig_paddle.PARAMETER_VALUE)
-            assert isinstance(val, swig_paddle.Vector)
-            arr = numpy.full((len(val), ), 0.1, dtype="float32")
-            val.copyFromNumpyArray(arr)
-            self.assertTrue(param.save(param.getName()))
-            param_config = param.getConfig().toProto()
-            assert isinstance(param_config,
-                              paddle.proto.ParameterConfig_pb2.ParameterConfig)
-            opt = swig_paddle.ParameterOptimizer.create(opt_config)
-            optimizers[param.getID()] = opt
-            num_rows = param_config.dims[1]
-            opt.init(num_rows, param.getConfig())
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.startPass()
-
-        batch_size = ipt.getSlotValue(0).getHeight()
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.startBatch(batch_size)
-
-        machine.forward(ipt, output, swig_paddle.PASS_TRAIN)
-        self.assertEqual(1, output.getSlotNum())
-        self.isCalled = False
-
-        def backward_callback(param_):
-            self.isCalled = isinstance(param_, swig_paddle.Parameter)
-            assert isinstance(param_, swig_paddle.Parameter)
-            vec = param_.getBuf(swig_paddle.PARAMETER_VALUE)
-            assert isinstance(vec, swig_paddle.Vector)
-            vec = vec.copyToNumpyArray()
-            for val_ in vec:
-                self.assertTrue(
-                    util.doubleEqual(val_, 0.1))  # Assert All Value is 0.1
-
-            vecs = list(param_.getBufs())
-            opt_ = optimizers[param_.getID()]
-            opt_.update(vecs, param_.getConfig())
-
-        machine.backward(backward_callback)
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.finishBatch()
-
-        for k in optimizers:
-            opt = optimizers[k]
-            opt.finishPass()
-
-        self.assertTrue(self.isCalled)
-
-        for param in machine.getParameters():
-            self.assertTrue(param.load(param.getName()))
-
-    def test_train_one_pass(self):
-        conf_file_path = './testTrainConfig.py'
-        trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
-            conf_file_path)
-        model_config = trainer_config.getModelConfig()
-        machine = swig_paddle.GradientMachine.createByModelConfig(model_config)
-
-        at_end = False
-
-        output = swig_paddle.Arguments.createArguments(0)
-        if not at_end:
-            input_, at_end = util.loadMNISTTrainData(1000)
-            machine.forwardBackward(input_, output, swig_paddle.PASS_TRAIN)
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle('--use_gpu=0')
-    unittest.main()
diff --git a/paddle/legacy/api/test/testMatrix.py b/paddle/legacy/api/test/testMatrix.py
deleted file mode 100644
index f08fbf3cc..000000000
--- a/paddle/legacy/api/test/testMatrix.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import numpy as np
-import unittest
-
-
-class TestMatrix(unittest.TestCase):
-    def test_createZero_get_set(self):
-        m = swig_paddle.Matrix.createZero(32, 24)
-        self.assertEqual(m.getWidth(), 24)
-        self.assertEqual(m.getHeight(), 32)
-        for x in xrange(24):
-            for y in xrange(32):
-                self.assertEqual(0.0, m.get(x, y))
-        with self.assertRaises(swig_paddle.RangeError):
-            m.get(51, 47)
-        m.set(3, 3, 3.0)
-        self.assertEqual(m.get(3, 3), 3.0)
-
-    def test_sparse(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, True, False, False)
-        self.assertIsNotNone(m)
-        self.assertTrue(m.isSparse())
-        self.assertEqual(m.getSparseValueType(), swig_paddle.SPARSE_NON_VALUE)
-        self.assertEqual(m.getSparseFormat(), swig_paddle.SPARSE_CSR)
-        m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [])
-        self.assertEqual(m.getSparseRowCols(0), [0, 1])
-        self.assertEqual(m.getSparseRowCols(1), [2])
-        self.assertEqual(m.getSparseRowCols(2), [])
-
-    def test_sparse_value(self):
-        m = swig_paddle.Matrix.createSparse(3, 3, 6, False, False, False)
-        self.assertIsNotNone(m)
-        m.sparseCopyFrom([0, 2, 3, 3], [0, 1, 2], [7.3, 4.2, 3.2])
-
-        def assertKVArraySame(actual, expect):
-            self.assertEqual(len(actual), len(expect))
-            for i in xrange(len(actual)):
-                a = actual[i]
-                e = expect[i]
-                self.assertIsInstance(a, tuple)
-                self.assertIsInstance(e, tuple)
-                self.assertEqual(len(a), 2)
-                self.assertEqual(len(e), 2)
-                self.assertEqual(a[0], e[0])
-                self.assertTrue(abs(a[1] - e[1]) < 1e-5)
-
-        first_row = m.getSparseRowColsVal(0)
-        assertKVArraySame(first_row, [(0, 7.3), (1, 4.2)])
-
-    def test_createDenseMat(self):
-        m = swig_paddle.Matrix.createDense([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], 2, 3)
-        self.assertIsNotNone(m)
-        self.assertTrue(abs(m.get(1, 1) - 0.5) < 1e-5)
-
-    def test_numpyCpu(self):
-        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
-                         numpy_mat.shape)
-
-        # the numpy matrix and paddle matrix shared the same memory.
-        numpy_mat[0, 1] = 342.23
-
-        for h in xrange(m.getHeight()):
-            for w in xrange(m.getWidth()):
-                self.assertEqual(m.get(h, w), numpy_mat[h, w])
-
-        mat2 = m.toNumpyMatInplace()
-        mat2[1, 1] = 32.2
-        self.assertTrue(np.array_equal(mat2, numpy_mat))
-
-    def test_numpyGpu(self):
-        if swig_paddle.isGpuVersion():
-            numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype='float32')
-            gpu_m = swig_paddle.Matrix.createGpuDenseFromNumpy(numpy_mat)
-            assert isinstance(gpu_m, swig_paddle.Matrix)
-            self.assertEqual((int(gpu_m.getHeight()), int(gpu_m.getWidth())),
-                             numpy_mat.shape)
-            self.assertTrue(gpu_m.isGpu())
-            numpy_mat = gpu_m.copyToNumpyMat()
-            numpy_mat[0, 1] = 3.23
-            for a, e in zip(gpu_m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
-                self.assertAlmostEqual(a, e)
-
-            gpu_m.copyFromNumpyMat(numpy_mat)
-
-            for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
-                self.assertAlmostEqual(a, e)
-
-    def test_numpy(self):
-        numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
-                         numpy_mat.shape)
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
-            self.assertAlmostEqual(a, e)
-
-
-if __name__ == "__main__":
-    swig_paddle.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestMatrix)
-    unittest.TextTestRunner().run(suite)
-    if swig_paddle.isGpuVersion():
-        swig_paddle.setUseGpu(True)
-        unittest.main()
diff --git a/paddle/legacy/api/test/testTrain.py b/paddle/legacy/api/test/testTrain.py
deleted file mode 100644
index 7061a4c43..000000000
--- a/paddle/legacy/api/test/testTrain.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import paddle.trainer.config_parser
-import numpy
-import util
-
-
-def init_params(params):
-    def init_param(p):
-        assert isinstance(p, swig_paddle.Parameter)
-        val = p.getBuf(swig_paddle.PARAMETER_VALUE)
-        assert isinstance(val, swig_paddle.Vector)
-        arr = val.toNumpyArrayInplace()
-        for i in xrange(len(arr)):
-            arr[i] = numpy.random.uniform(-1.0, 1.0)
-
-    for p in params:
-        init_param(p)
-
-
-def init_optimizers(opt_conf, params):
-    opts = {}
-    for param in params:
-        param_conf = param.getConfig().toProto()
-        opts[param.getID()] = swig_paddle.ParameterOptimizer.create(opt_conf)
-        opts[param.getID()].init(param_conf.dims[1], param.getConfig())
-    retv_opts = [None for _ in xrange(len(opts))]
-    for k in opts:
-        assert k < len(retv_opts)
-        retv_opts[k] = opts[k]
-    return retv_opts
-
-
-def main():
-    trainer_config = paddle.trainer.config_parser.parse_config(
-        "./testTrainConfig.py", "")
-    opt_config = trainer_config.opt_config
-    print "========Optimization Config ======="
-    print opt_config
-    print "==================================="
-    opt_config = swig_paddle.OptimizationConfig.createFromProto(opt_config)
-    _temp_optimizer_ = swig_paddle.ParameterOptimizer.create(opt_config)
-    enable_types = _temp_optimizer_.getParameterTypes()
-    m = swig_paddle.GradientMachine.createFromConfigProto(
-        trainer_config.model_config, swig_paddle.CREATE_MODE_NORMAL,
-        enable_types)
-    assert m is not None
-    assert isinstance(m, swig_paddle.GradientMachine)
-    init_params(m.getParameters())
-
-    optimizers = init_optimizers(opt_config, m.getParameters())
-
-    # Train One Pass.
-    for optimizer in optimizers:
-        optimizer.startPass()
-    batch_id = 0
-    while True:  # Train one batch
-        batch_size = 1000
-        inArgs, atEnd = util.loadMNISTTrainData(batch_size)
-        if atEnd:
-            break
-        outArgs = swig_paddle.Arguments.createArguments(0)
-
-        for optimizer in optimizers:
-            optimizer.startBatch(batch_size)
-
-        def update_callback(param):
-            try:
-                bufs = list(param.getBufs())
-                opt = optimizers[param.getID()]
-                opt.update(bufs, param.getConfig())
-                callback = opt.needSpecialTraversal(param.getConfig())
-                if callback is not None:
-                    callback(bufs, param.getConfig(), swig_paddle.NO_SPARSE_ID)
-
-            except Exception as e:
-                print e
-
-        ev = m.makeEvaluator()
-        ev.start()
-        m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
-                          update_callback)
-        m.eval(ev)
-        ev.finish()
-        for name in ev.getNames():
-            print name, ev.getValue(name)
-        for optimizer in optimizers:
-            optimizer.finishBatch()
-
-        cost_vec = outArgs.getSlotValue(0)
-        assert isinstance(cost_vec, swig_paddle.Matrix)
-        cost_vec = cost_vec.copyToNumpyMat()
-        print 'Finish Batch', batch_id, 'with cost ', cost_vec.sum(
-        ) / batch_size
-        batch_id += 1
-
-    for optimizer in optimizers:
-        optimizer.finishPass()
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
-    main()
diff --git a/paddle/legacy/api/test/testTrainConfig.py b/paddle/legacy/api/test/testTrainConfig.py
deleted file mode 100644
index c02d61eba..000000000
--- a/paddle/legacy/api/test/testTrainConfig.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_method=AdamOptimizer())
-
-din = data_layer(name='input', size=784)
-
-fc1 = fc_layer(name='hidden1', input=din, size=100)
-fc2 = fc_layer(name='hidden2', input=fc1, size=100)
-
-opt = fc_layer(input=fc2, size=10, act=SoftmaxActivation())
-outputs(classification_cost(input=opt, label=data_layer('lbl', 10)))
diff --git a/paddle/legacy/api/test/testTrainer.py b/paddle/legacy/api/test/testTrainer.py
deleted file mode 100644
index a76cbf02d..000000000
--- a/paddle/legacy/api/test/testTrainer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-from py_paddle import swig_paddle
-import util
-
-
-def main():
-    trainer_config = parse_config("./testTrainConfig.py", "")
-    model = swig_paddle.GradientMachine.createFromConfigProto(
-        trainer_config.model_config)
-    trainer = swig_paddle.Trainer.create(trainer_config, model)
-    trainer.startTrain()
-    for train_pass in xrange(2):
-        trainer.startTrainPass()
-        num = 0
-        cost = 0
-        while True:  # Train one batch
-            batch_size = 1000
-            data, atEnd = util.loadMNISTTrainData(batch_size)
-            if atEnd:
-                break
-            trainer.trainOneDataBatch(batch_size, data)
-            outs = trainer.getForwardOutput()
-            cost += sum(outs[0]['value'])
-            num += batch_size
-        trainer.finishTrainPass()
-        logger.info('train cost=%f' % (cost / num))
-
-        trainer.startTestPeriod()
-        num = 0
-        cost = 0
-        while True:  # Test one batch
-            batch_size = 1000
-            data, atEnd = util.loadMNISTTrainData(batch_size)
-            if atEnd:
-                break
-            trainer.testOneDataBatch(batch_size, data)
-            outs = trainer.getForwardOutput()
-            cost += sum(outs[0]['value'])
-            num += batch_size
-        trainer.finishTestPeriod()
-        logger.info('test cost=%f' % (cost / num))
-
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
-    main()
diff --git a/paddle/legacy/api/test/testVector.py b/paddle/legacy/api/test/testVector.py
deleted file mode 100644
index 6339cf854..000000000
--- a/paddle/legacy/api/test/testVector.py
+++ /dev/null
@@ -1,153 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle
-import util
-import numpy as np
-import unittest
-
-
-class TestIVector(unittest.TestCase):
-    def test_createZero(self):
-        m = swig_paddle.IVector.createZero(10, False)
-        self.assertIsNotNone(m)
-        for i in xrange(10):
-            self.assertEqual(m[i], 0)
-            m[i] = i
-            self.assertEqual(m[i], i)
-
-        m = swig_paddle.IVector.createZero(10)
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), [0] * 10)
-
-    def test_create(self):
-        m = swig_paddle.IVector.create(range(10), False)
-        self.assertIsNotNone(m)
-        for i in xrange(10):
-            self.assertEqual(m[i], i)
-
-        m = swig_paddle.IVector.create(range(10))
-        self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), range(10))
-
-    def test_cpu_numpy(self):
-        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
-        self.assertEqual(vec.shape[0], int(iv.__len__()))
-        vec[4] = 832
-        for i in xrange(len(iv)):
-            self.assertEqual(vec[i], iv[i])
-        vec2 = iv.toNumpyArrayInplace()
-        vec2[1] = 384
-        for i in xrange(len(iv)):
-            self.assertEqual(vec[i], iv[i])
-            self.assertEqual(vec2[i], iv[i])
-
-    def test_gpu_numpy(self):
-        if swig_paddle.isGpuVersion():
-            vec = swig_paddle.IVector.create(range(0, 10), True)
-            assert isinstance(vec, swig_paddle.IVector)
-            self.assertTrue(vec.isGpu())
-            self.assertEqual(vec.getData(), range(0, 10))
-            num_arr = vec.copyToNumpyArray()
-            assert isinstance(num_arr, np.ndarray)  # for code hint.
-            num_arr[4] = 7
-            self.assertEquals(vec.getData(), range(0, 10))
-
-            vec.copyFromNumpyArray(num_arr)
-            expect_vec = range(0, 10)
-            expect_vec[4] = 7
-            self.assertEqual(vec.getData(), expect_vec)
-
-    def test_numpy(self):
-        vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createVectorFromNumpy(vec)
-        self.assertEqual(iv.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(iv.getData(), list(vec))
-
-
-class TestVector(unittest.TestCase):
-    def testCreateZero(self):
-        v = swig_paddle.Vector.createZero(10, False)
-        self.assertIsNotNone(v)
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(v[i], 0))
-            v[i] = i
-            self.assertTrue(util.doubleEqual(v[i], i))
-
-        v = swig_paddle.Vector.createZero(10)
-        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(v.getData(), [0] * 10)
-
-    def testCreate(self):
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
-        self.assertIsNotNone(v)
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(v[i], i / 100.0))
-        self.assertEqual(100, len(v))
-
-        v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
-        self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(100, len(v))
-        vdata = v.getData()
-        for i in xrange(len(v)):
-            self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
-
-    def testCpuNumpy(self):
-        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
-        assert isinstance(vec, swig_paddle.Vector)
-        numpy_arr[0] = 0.1
-        for n, v in zip(numpy_arr, vec):
-            self.assertTrue(util.doubleEqual(n, v))
-
-        numpy_2 = vec.toNumpyArrayInplace()
-        vec[0] = 1.3
-        for x, y in zip(numpy_arr, numpy_2):
-            self.assertTrue(util.doubleEqual(x, y))
-
-        for x, y in zip(numpy_arr, vec):
-            self.assertTrue(util.doubleEqual(x, y))
-
-        numpy_3 = vec.copyToNumpyArray()
-        numpy_3[0] = 0.4
-        self.assertTrue(util.doubleEqual(vec[0], 1.3))
-        self.assertTrue(util.doubleEqual(numpy_3[0], 0.4))
-
-        for i in xrange(1, len(numpy_3)):
-            util.doubleEqual(numpy_3[i], vec[i])
-
-    def testNumpy(self):
-        numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
-        self.assertEqual(vec.isGpu(), swig_paddle.isUsingGpu())
-        vecData = vec.getData()
-        for n, v in zip(numpy_arr, vecData):
-            self.assertTrue(util.doubleEqual(n, v))
-
-    def testCopyFromNumpy(self):
-        vec = swig_paddle.Vector.createZero(1, False)
-        arr = np.array([1.3, 3.2, 2.4], dtype="float32")
-        vec.copyFromNumpyArray(arr)
-        for i in xrange(len(vec)):
-            self.assertTrue(util.doubleEqual(vec[i], arr[i]))
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(TestVector)
-    unittest.TextTestRunner().run(suite)
-    if swig_paddle.isGpuVersion():
-        swig_paddle.setUseGpu(True)
-        unittest.main()
diff --git a/paddle/legacy/api/test/util.py b/paddle/legacy/api/test/util.py
deleted file mode 100644
index 9f4631c53..000000000
--- a/paddle/legacy/api/test/util.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-
-import numpy as np
-from py_paddle import swig_paddle
-
-
-def doubleEqual(a, b):
-    return abs(a - b) < 1e-5
-
-
-def __readFromFile():
-    for i in xrange(10002):
-        label = np.random.randint(0, 9)
-        sample = np.random.rand(784) + 0.1 * label
-        yield sample, label
-
-
-def loadMNISTTrainData(batch_size=100):
-    if not hasattr(loadMNISTTrainData, "gen"):
-        generator = __readFromFile()
-        loadMNISTTrainData.gen = generator
-    else:
-        generator = loadMNISTTrainData.gen
-    args = swig_paddle.Arguments.createArguments(2)
-    # batch_size = 100
-
-    dense_slot = []
-    id_slot = []
-    atEnd = False
-
-    for _ in xrange(batch_size):
-        try:
-            result = generator.next()
-            dense_slot.extend(result[0])
-            id_slot.append(result[1])
-        except StopIteration:
-            atEnd = True
-            del loadMNISTTrainData.gen
-            break
-
-    dense_slot = swig_paddle.Matrix.createDense(dense_slot, batch_size, 784)
-    id_slot = swig_paddle.IVector.create(id_slot)
-    args.setSlotValue(0, dense_slot)
-    args.setSlotIds(1, id_slot)
-    return args, atEnd
diff --git a/paddle/legacy/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp
deleted file mode 100644
index 0ce1770c7..000000000
--- a/paddle/legacy/capi/Arguments.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "arguments.h"
-#include "capi_private.h"
-
-using paddle::capi::cast;
-
-#define castArg(v) cast<paddle::capi::CArguments>(v)
-#define castIVec(v) cast<paddle::capi::CIVector>(v)
-
-extern "C" {
-paddle_arguments paddle_arguments_create_none() {
-  return new paddle::capi::CArguments();
-}
-
-paddle_error paddle_arguments_destroy(paddle_arguments args) {
-  if (args == nullptr) return kPD_NULLPTR;
-  delete castArg(args);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_size(paddle_arguments args, uint64_t* size) {
-  if (args == nullptr || size == nullptr) return kPD_NULLPTR;
-  *size = castArg(args)->args.size();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_resize(paddle_arguments args, uint64_t size) {
-  if (args == nullptr) return kPD_NULLPTR;
-  castArg(args)->args.resize(size);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_value(paddle_arguments args,
-                                        uint64_t ID,
-                                        paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  if (m->mat == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].value = m->mat;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_value(paddle_arguments args,
-                                        uint64_t ID,
-                                        paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  m->mat = a->args[ID].value;
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
-                                              uint64_t ID,
-                                              paddle_matrix mat) {
-  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
-  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  m->mat = a->args[ID].in;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_get_ids(paddle_arguments args,
-                                      uint64_t ID,
-                                      paddle_ivector ids) {
-  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
-  auto iv = castIVec(ids);
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  iv->vec = a->args[ID].ids;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_ids(paddle_arguments args,
-                                      uint64_t ID,
-                                      paddle_ivector ids) {
-  //! TODO(lizhao): Complete this method.
-  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(ids);
-  if (iv->vec == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].ids = iv->vec;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
-                                              uint64_t ID,
-                                              uint64_t frameHeight,
-                                              uint64_t frameWidth) {
-  if (args == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
-  a->args[ID].setFrameHeight(frameHeight);
-  a->args[ID].setFrameWidth(frameWidth);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint32_t nestedLevel,
-                                                     paddle_ivector seqPos) {
-  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
-  if (iv->vec == nullptr) return kPD_NULLPTR;
-  auto a = castArg(args);
-  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
-    ptr = std::make_shared<paddle::ICpuGpuVector>(iv->vec);
-  });
-}
-
-paddle_error paddle_arguments_get_sequence_start_pos(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint32_t nestedLevel,
-                                                     paddle_ivector seqPos) {
-  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
-  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
-  auto a = castArg(args);
-  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
-    iv->vec = ptr->getMutableVector(false);
-  });
-}
-}
diff --git a/paddle/legacy/capi/CMakeLists.txt b/paddle/legacy/capi/CMakeLists.txt
deleted file mode 100644
index 957b1a3e6..000000000
--- a/paddle/legacy/capi/CMakeLists.txt
+++ /dev/null
@@ -1,118 +0,0 @@
-if (WITH_DOUBLE)
-  set(PADDLE_FLOAT_TYPE double)
-else ()
-  set(PADDLE_FLOAT_TYPE float)
-endif()
-
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
-  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT PADDLE_GIT_COMMIT)
-  set(PADDLE_GIT_COMMIT "no commit information")
-endif()
-
-# config.h used for C-API. It will store Paddle building configuration as a
-# header. Make user just include PaddleCAPI.h then can get building
-# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
-# libraries.
-configure_file(config.h.in config.h @ONLY)
-
-# PaddleCAPI.h is the only header we exposed. It currently only used for model
-# inference.
-file(GLOB CAPI_HEADERS *.h)
-set(CAPI_PRIVATE_HEADER capi_private.h)
-list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
-file(GLOB CAPI_SOURCES *.cpp)
-
-# building paddle_capi
-add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
-  ${CAPI_SOURCES})
-
-target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-
-add_dependencies(paddle_capi paddle_proto paddle_gserver)
-
-# TODO: paddle_capi_whole will be removed.
-set(PADDLE_CAPI_LAYERS_LIBS
-    paddle_function
-    paddle_gserver)
-if(MOBILE_INFERENCE)
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto)
-else()
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto
-      paddle_pserver
-      paddle_network)
-endif()
-set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
-
-# Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
-
-# Link the shared library for inference
-if(NOT IOS)
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
-  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
-  set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-  link_paddle_exe(paddle_capi_shared)
-endif()
-
-# install library & headers.
-install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
-install(FILES paddle_capi.map DESTINATION include/paddle)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
-if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
-          ARCHIVE DESTINATION lib/${ANDROID_ABI}
-          LIBRARY DESTINATION lib/${ANDROID_ABI})
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_COMMITS_LIST
-    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(${GIT_COMMITS_LIST_RESULT})
-    set(GIT_COMMITS_LIST "No commits.")
-  endif()
-  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
-          \"Compiler:\n\"
-          \"\\t${CMAKE_C_COMPILER}\\n\"
-          \"\\t${CMAKE_CXX_COMPILER}\\n\"
-          \"Compiler Flags:\\n\"
-          \"\\t${CMAKE_F_FLAGS}\\n\"
-          \"\\t${CMAKE_CXX_FLAGS}\\n\"
-          \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
-          \"Lastest commit:\\n\"
-          \"\\t${GIT_COMMITS_LIST}\\n\"
-      )"
-  )
-else(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
-  if(NOT IOS)
-    install(TARGETS paddle_capi_shared DESTINATION lib)
-  endif()
-endif(ANDROID)
-
-# this variable used for unittest
-set(PADDLE_CAPI_INC_PATH
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (WITH_TESTING)
-  add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/capi/Main.cpp b/paddle/legacy/capi/Main.cpp
deleted file mode 100644
index 17d8f00a8..000000000
--- a/paddle/legacy/capi/Main.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-#include "capi_private.h"
-#include "main.h"
-#include "paddle/legacy/trainer/TrainerConfigHelper.h"
-#include "paddle/legacy/utils/Excepts.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-static void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-}
-
-extern "C" {
-paddle_error paddle_init(int argc, char** argv) {
-  static bool isInit = false;
-  if (isInit) return kPD_NO_ERROR;
-
-  std::vector<char*> realArgv;
-  realArgv.reserve(argc + 1);
-  realArgv.push_back(strdup(""));
-  for (int i = 0; i < argc; ++i) {
-    realArgv.push_back(argv[i]);
-  }
-  initPaddle(argc + 1, realArgv.data());
-  free(realArgv[0]);
-  isInit = true;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_init_thread() {
-  if (FLAGS_use_gpu) {
-    hl_init(FLAGS_gpu_id);
-  }
-  return kPD_NO_ERROR;
-}
-}
diff --git a/paddle/legacy/capi/Matrix.cpp b/paddle/legacy/capi/Matrix.cpp
deleted file mode 100644
index 733d49cac..000000000
--- a/paddle/legacy/capi/Matrix.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi_private.h"
-#include "hl_cuda.h"
-#include "matrix.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
-extern "C" {
-paddle_matrix paddle_matrix_create(uint64_t height,
-                                   uint64_t width,
-                                   bool useGpu) {
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
-  return ptr;
-}
-
-paddle_matrix paddle_matrix_create_none() {
-  return new paddle::capi::CMatrix();
-}
-
-paddle_error paddle_matrix_destroy(paddle_matrix mat) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  delete ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_set_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real* rowArray) {
-  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  paddle::real* buf = ptr->mat->getRowBuf(rowID);
-  size_t width = ptr->mat->getWidth();
-#ifdef PADDLE_WITH_CUDA
-  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
-#else
-  std::copy(rowArray, rowArray + width, buf);
-#endif
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                            paddle_real* value) {
-  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(value, value + width * height, buf);
-  }
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                            paddle_real* result) {
-  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(buf, buf + width * height, result);
-  }
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real** rawRowBuffer) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  *rawRowBuffer = ptr->mat->getRowBuf(rowID);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_shape(paddle_matrix mat,
-                                     uint64_t* height,
-                                     uint64_t* width) {
-  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
-  if (height != nullptr) {
-    *height = cast(mat)->mat->getHeight();
-  }
-  if (width != nullptr) {
-    *width = cast(mat)->mat->getWidth();
-  }
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_matrix paddle_matrix_create_sparse(
-    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      paddle::SPARSE_CSR,
-      false,
-      useGpu);
-  return ptr;
-#else
-  return nullptr;
-#endif
-}
-
-paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
-                                            int* rowArray,
-                                            uint64_t rowSize,
-                                            int* colArray,
-                                            uint64_t colSize,
-                                            float* valueArray,
-                                            uint64_t valueSize) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (rowArray == nullptr || colArray == nullptr ||
-      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
-    return kPD_NULLPTR;
-  }
-  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
-    std::vector<int> row(rowSize);
-    row.assign(rowArray, rowArray + rowSize);
-    std::vector<int> col(colSize);
-    col.assign(colArray, colArray + colSize);
-    std::vector<paddle_real> val(valueSize);
-    if (valueSize) {
-      val.assign(valueArray, valueArray + valueSize);
-    }
-    sparseMat->copyFrom(row, col, val);
-    return kPD_NO_ERROR;
-  } else {
-    return kPD_NOT_SUPPORTED;
-  }
-#else
-  return kPD_NOT_SUPPORTED;
-#endif
-}
diff --git a/paddle/legacy/capi/Vector.cpp b/paddle/legacy/capi/Vector.cpp
deleted file mode 100644
index afb5a9afe..000000000
--- a/paddle/legacy/capi/Vector.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi_private.h"
-#include "vector.h"
-
-using paddle::capi::cast;
-
-extern "C" {
-
-paddle_ivector paddle_ivector_create_none() {
-  return new paddle::capi::CIVector();
-}
-
-paddle_ivector paddle_ivector_create(int* array,
-                                     uint64_t size,
-                                     bool copy,
-                                     bool useGPU) {
-  auto ptr = new paddle::capi::CIVector();
-  if (copy) {
-    ptr->vec = paddle::IVector::create(size, useGPU);
-    ptr->vec->copyFrom(array, size);
-  } else {
-    ptr->vec = paddle::IVector::create(array, size, useGPU);
-  }
-  return ptr;
-}
-
-paddle_error paddle_ivector_destroy(paddle_ivector ivec) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  delete cast<paddle::capi::CIVector>(ivec);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer) {
-  if (ivec == nullptr || buffer == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  *buffer = v->vec->getData();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  v->vec->resize(size);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_ivector_get_size(paddle_ivector ivec, uint64_t* size) {
-  if (ivec == nullptr) return kPD_NULLPTR;
-  auto v = cast<paddle::capi::CIVector>(ivec);
-  if (v->vec == nullptr) return kPD_NULLPTR;
-  *size = v->vec->getSize();
-  return kPD_NO_ERROR;
-}
-}
diff --git a/paddle/legacy/capi/arguments.h b/paddle/legacy/capi/arguments.h
deleted file mode 100644
index ceb64ee6a..000000000
--- a/paddle/legacy/capi/arguments.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_ARGUMENTS_H__
-#define __PADDLE_CAPI_ARGUMENTS_H__
-
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-#include "matrix.h"
-#include "vector.h"
-
-/**
- * Arguments functions. Each argument means layer output. Arguments means a
- * array of arguemnt.
- */
-typedef void* paddle_arguments;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * @brief paddle_arguments_create_none Create a array of arguments, which size
- * is zero.
- * @return Arguemnts
- */
-PD_API paddle_arguments paddle_arguments_create_none();
-
-/**
- * @brief paddle_arguments_destroy Destroy the arguments
- * @param args arguments to destroy
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_destroy(paddle_arguments args);
-
-/**
- * @brief paddle_arguments_get_size Get size of arguments array
- * @param [in] args arguments array
- * @param [out] size array size
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_size(paddle_arguments args,
-                                              uint64_t* size);
-
-/**
- * @brief PDArgsResize Resize a arguments array.
- * @param args arguments array.
- * @param size target size of array
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_resize(paddle_arguments args,
-                                            uint64_t size);
-
-/**
- * @brief PDArgsSetValue Set value matrix of one argument in array, which index
- *        is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_value(paddle_arguments args,
-                                               uint64_t ID,
-                                               paddle_matrix mat);
-
-/**
- * @brief PDArgsGetValue Get value matrix of one argument in array, which index
- *        is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
-                                               uint64_t ID,
-                                               paddle_matrix mat);
-
-/**
- * @brief paddle_arguments_get_prob Get the prob matrix of beam search, which
- *        slot ID is `ID`
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] mat matrix pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_prob(paddle_arguments args,
-                                              uint64_t ID,
-                                              paddle_matrix mat);
-
-/**
- * @brief PDArgsGetIds Get the integer vector of one argument in array, which
- *        index is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param ids integer vector pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_get_ids(paddle_arguments args,
-                                             uint64_t ID,
-                                             paddle_ivector ids);
-
-/**
- * @brief PDArgsSetIds Set the integer vector of one argument in array, which
- *        index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] ids integer vector pointer
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
-                                             uint64_t ID,
-                                             paddle_ivector ids);
-
-/**
- * @brief paddle_arguments_set_frame_shape Set the fram size of one argument
- *        in array, which index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [in] frameHeight maximum height of input images
- * @param [in] frameWidth maximum width of input images
- * @return paddle_error
- */
-PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
-                                                     uint64_t ID,
-                                                     uint64_t frameHeight,
-                                                     uint64_t frameWidth);
-
-/**
- * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
- *        argument in array, which index is `ID`.
- * @param args arguments array
- * @param ID array index
- * @param seqPos sequence position array.
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_arguments_set_sequence_start_pos(paddle_arguments args,
-                                        uint64_t ID,
-                                        uint32_t nestedLevel,
-                                        paddle_ivector seqPos);
-/**
- * @brief PDArgsGetSequenceStartPos Get sequence start position vector of one
- *        argument in array, which index is `ID`.
- * @param [in] args arguments array
- * @param [in] ID array index
- * @param [out] seqPos sequence position array
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_arguments_get_sequence_start_pos(paddle_arguments args,
-                                        uint64_t ID,
-                                        uint32_t nestedLevel,
-                                        paddle_ivector seqPos);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/capi.h b/paddle/legacy/capi/capi.h
deleted file mode 100644
index 749fcc4b7..000000000
--- a/paddle/legacy/capi/capi.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_H__
-#define __PADDLE_CAPI_H__
-
-/**
- * Paddle C API. It will replace SWIG as Multiple Language API for model
- * training & inference. Currently it is only used in model infernece.
- *
- * NOTE: This is an experimental API, it could be changed.
- */
-#include "arguments.h"
-#include "config.h"
-#include "error.h"
-#include "gradient_machine.h"
-#include "main.h"
-#include "matrix.h"
-#include "vector.h"
-
-#endif  // PADDLECAPI_H_
diff --git a/paddle/legacy/capi/capi_private.h b/paddle/legacy/capi/capi_private.h
deleted file mode 100644
index e5f8c8c5c..000000000
--- a/paddle/legacy/capi/capi_private.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Argument.h"
-#pragma once
-
-namespace paddle {
-namespace capi {
-
-enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
-
-#define STRUCT_HEADER CType type;
-
-struct CHeader {
-  STRUCT_HEADER
-};
-
-struct CIVector {
-  STRUCT_HEADER
-  IVectorPtr vec;
-
-  CIVector() : type(kIVECTOR) {}
-};
-
-struct CMatrix {
-  STRUCT_HEADER
-  MatrixPtr mat;
-
-  CMatrix() : type(kMATRIX) {}
-};
-
-struct CArguments {
-  STRUCT_HEADER
-  std::vector<paddle::Argument> args;
-
-  CArguments() : type(kARGUMENTS) {}
-
-  template <typename T>
-  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
-    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
-    switch (nestedLevel) {
-      case 0:
-        callback(args[ID].sequenceStartPositions);
-        break;
-      case 1:
-        callback(args[ID].subSequenceStartPositions);
-        break;
-      default:
-        return kPD_OUT_OF_RANGE;
-    }
-    return kPD_NO_ERROR;
-  }
-};
-
-struct CGradientMachine {
-  STRUCT_HEADER
-  paddle::GradientMachinePtr machine;
-
-  CGradientMachine() : type(kGRADIENT_MACHINE) {}
-};
-
-template <typename T>
-inline T* cast(void* ptr) {
-  return reinterpret_cast<T*>(ptr);
-}
-}  // namespace capi
-}  // namespace paddle
diff --git a/paddle/legacy/capi/config.h.in b/paddle/legacy/capi/config.h.in
deleted file mode 100644
index 0ddbd8c75..000000000
--- a/paddle/legacy/capi/config.h.in
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
-#define __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
-
-typedef @PADDLE_FLOAT_TYPE@ paddle_real;
-
-#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
-#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
-
-// Since we only support linux and macos in compile, always use clang or
-// gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
-#define PD_API __attribute__((visibility("default")))
-
-#endif
diff --git a/paddle/legacy/capi/error.cpp b/paddle/legacy/capi/error.cpp
deleted file mode 100644
index 0c25de5ba..000000000
--- a/paddle/legacy/capi/error.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "error.h"
-
-extern "C" const char* paddle_error_string(paddle_error err) {
-  switch (err) {
-    case kPD_NULLPTR:
-      return "nullptr error";
-    case kPD_OUT_OF_RANGE:
-      return "out of range error";
-    case kPD_PROTOBUF_ERROR:
-      return "protobuf error";
-    case kPD_NOT_SUPPORTED:
-      return "not supported error";
-    case kPD_UNDEFINED_ERROR:
-      return "undefined error";
-    default:
-      return "";
-  }
-}
diff --git a/paddle/legacy/capi/error.h b/paddle/legacy/capi/error.h
deleted file mode 100644
index b0940725b..000000000
--- a/paddle/legacy/capi/error.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_ERROR_H__
-#define __PADDLE_CAPI_ERROR_H__
-
-#include "config.h"
-
-/**
- * Error Type for Paddle API.
- */
-typedef enum {
-  kPD_NO_ERROR = 0,
-  kPD_NULLPTR = 1,
-  kPD_OUT_OF_RANGE = 2,
-  kPD_PROTOBUF_ERROR = 3,
-  kPD_NOT_SUPPORTED = 4,
-  kPD_UNDEFINED_ERROR = -1,
-} paddle_error;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Error string for Paddle API.
- */
-PD_API const char* paddle_error_string(paddle_error err);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/examples/.gitignore b/paddle/legacy/capi/examples/.gitignore
deleted file mode 100644
index 2caa0a5a2..000000000
--- a/paddle/legacy/capi/examples/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.bin
-build-*
diff --git a/paddle/legacy/capi/examples/README.md b/paddle/legacy/capi/examples/README.md
deleted file mode 100644
index 14013e281..000000000
--- a/paddle/legacy/capi/examples/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# C-API Example Usage
-
-* [Model Inference](./model_inference/README.md)
diff --git a/paddle/legacy/capi/examples/model_inference/README.md b/paddle/legacy/capi/examples/model_inference/README.md
deleted file mode 100644
index 58e6c8314..000000000
--- a/paddle/legacy/capi/examples/model_inference/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Use C-API for Model Inference
-
-There are several examples in this directory about how to use Paddle C-API for model inference.
-
-## Convert configuration file to protobuf binary.
-
-Firstly, the user should convert Paddle's model configuration file into a protobuf binary file. In each example directory, there is a file named `convert_protobin.sh`. It will convert `trainer_config.conf` into `trainer_config.bin`.
-
-The `convert_protobin.sh` is very simple, just invoke `dump_config` Python module to dump the binary file. The command line usages are:
-
-```bash
-python -m paddle.utils.dump_config YOUR_CONFIG_FILE 'CONFIG_EXTRA_ARGS' --binary > YOUR_CONFIG_FILE.bin
-```
-
-## Initialize paddle
-
-```c++
-char* argv[] = {"--use_gpu=False"};
-paddle_init(1, (char**)argv);
-```
-
-We must initialize global context before we invoke other interfaces in Paddle. The initialize commands just like the `paddle_trainer` command line arguments.  `paddle train --help`,  will show the list of arguments. The most important argument is `use_gpu` or not.
-
-## Load network and parameters
-
-```c
-paddle_gradient_machine machine;
-paddle_gradient_machine_create_for_inference(&machine, config_file_content, content_size));
-paddle_gradient_machine_load_parameter_from_disk(machine, "./some_where_to_params"));
-```
-
-The gradient machine is a Paddle concept, which represents a neural network can be forwarded and backward. We can create a gradient machine fo model inference, and load the parameter files from disk.
-
-Moreover, if we want to inference in multi-thread, we could create a thread local gradient machine which shared the same parameter by using `paddle_gradient_machine_create_shared_param` API. Please reference `multi_thread` as an example.
-
-## Create input
-
-The input of a neural network is an `arguments`. The examples in this directory will show how to construct different types of inputs for prediction. Please look at `dense`, `sparse_binary`, `sequence` for details.
-
-## Get inference
-
-After invoking `paddle_gradient_machine_forward`, we could get the output of the neural network.  The `value` matrix of output arguments will store the neural network output values. If the output is a `SoftmaxActivation`, the `value` matrix are the probabilities of each input samples. The height of output matrix is number of sample. The width is the number of categories.
diff --git a/paddle/legacy/capi/examples/model_inference/common/common.h b/paddle/legacy/capi/examples/model_inference/common/common.h
deleted file mode 100644
index 23248b0ca..000000000
--- a/paddle/legacy/capi/examples/model_inference/common/common.h
+++ /dev/null
@@ -1,42 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#ifndef __CAPI_EXAMPLE_COMMON_H__
-#define __CAPI_EXAMPLE_COMMON_H__
-#include <stdio.h>
-#include <stdlib.h>
-
-#define CHECK(stmt)                                                      \
-  do {                                                                   \
-    paddle_error __err__ = stmt;                                         \
-    if (__err__ != kPD_NO_ERROR) {                                       \
-      fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
-      exit(__err__);                                                     \
-    }                                                                    \
-  } while (0)
-
-void* read_config(const char* filename, long* size) {
-  FILE* file = fopen(filename, "r");
-  if (file == NULL) {
-    fprintf(stderr, "Open %s error\n", filename);
-    return NULL;
-  }
-  fseek(file, 0L, SEEK_END);
-  *size = ftell(file);
-  fseek(file, 0L, SEEK_SET);
-  void* buf = malloc(*size);
-  fread(buf, 1, *size, file);
-  fclose(file);
-  return buf;
-}
-#endif
diff --git a/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
deleted file mode 100644
index 008a488fd..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-project(dense)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
deleted file mode 100755
index 30ffc316e..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-python -m paddle.utils.dump_config trainer_config.py '' --binary > trainer_config.bin
diff --git a/paddle/legacy/capi/examples/model_inference/dense/main.c b/paddle/legacy/capi/examples/model_inference/dense/main.c
deleted file mode 100644
index 90444889a..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/main.c
+++ /dev/null
@@ -1,116 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-
-#include "../common/common.h"
-
-// Modify this path as needed.
-#define CONFIG_BIN "./trainer_config.bin"
-// Modify this path as needed.
-// This demo assumes that merged model is not used, then this path is the
-// directory storing all the trained parameters.
-// If the model is trained by PaddlePaddle V2 API, the model is saved as
-// a compressed file. You need to uncompress the compressed file first.
-#define MODEL_PATH "models/pass_4"
-
-int main() {
-  // Initalize the PaddlePaddle runtime environment.
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Read the binary configuration file generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create the gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-
-  // Load the trained model. Modify the parameter MODEL_PATH to set the correct
-  // path of the trained model.
-  CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH));
-
-  // Inputs and outputs of the network are organized as paddle_arguments object
-  // in C-API. In the comments below, "argument" specifically means one input of
-  // the neural network in PaddlePaddle C-API.
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one data layer in this demo MNIST network, invoke this
-  // function to create one argument.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Each argument needs one matrix or one ivector (integer vector, for sparse
-  // index input, usually used in NLP task) to holds the real input data.
-  // In the comments below, "matrix" specifically means the object needed by
-  // argument to hold the data. Here we create the matrix for the above created
-  // agument to store the testing samples.
-  paddle_matrix mat =
-      paddle_matrix_create(/* height = batch size */ 1,
-                           /* width = dimensionality of the data layer */ 784,
-                           /* whether to use GPU */ false);
-
-  paddle_real* array;
-  // Get the pointer pointing to the start address of the first row of the
-  // created matrix.
-  CHECK(paddle_matrix_get_row(mat, 0, &array));
-
-  // Fill the matrix with a randomly generated test sample.
-  srand(time(0));
-  for (int i = 0; i < 784; ++i) {
-    array[i] = rand() / ((float)RAND_MAX);
-  }
-
-  // Assign the matrix to the argument.
-  CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-  // Create the output argument.
-  paddle_arguments out_args = paddle_arguments_create_none();
-
-  // Invoke the forward computation.
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* is train taks or not */ false));
-
-  // Create the matrix to hold the forward result of the neural network.
-  paddle_matrix prob = paddle_matrix_create_none();
-  // Access the matrix of the output argument, the predicted result is stored in
-  // which.
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  uint64_t height;
-  uint64_t width;
-  CHECK(paddle_matrix_get_shape(prob, &height, &width));
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: \n");
-  for (int i = 0; i < height * width; ++i) {
-    printf("%.4f ", array[i]);
-    if ((i + 1) % width == 0) {
-      printf("\n");
-    }
-  }
-  printf("\n");
-
-  // The cleaning up.
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
deleted file mode 100644
index 673aba203..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.utils.merge_model import merge_v2_model
-
-from mnist_v2 import network
-
-net = network(is_infer=True)
-param_file = "models/params_pass_4.tar"
-output_file = "output.paddle.model"
-merge_v2_model(net, param_file, output_file)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py b/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
deleted file mode 100644
index 3fd15d658..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import gzip
-import logging
-import argparse
-from PIL import Image
-import numpy as np
-
-import paddle.v2 as paddle
-from paddle.utils.dump_v2_config import dump_v2_config
-
-logger = logging.getLogger("paddle")
-logger.setLevel(logging.INFO)
-
-
-def multilayer_perceptron(img, layer_size, lbl_dim):
-    for idx, size in enumerate(layer_size):
-        hidden = paddle.layer.fc(input=(img if not idx else hidden),
-                                 size=size,
-                                 act=paddle.activation.Relu())
-    return paddle.layer.fc(input=hidden,
-                           size=lbl_dim,
-                           act=paddle.activation.Softmax())
-
-
-def network(input_dim=784, lbl_dim=10, is_infer=False):
-    images = paddle.layer.data(
-        name='pixel', type=paddle.data_type.dense_vector(input_dim))
-
-    predict = multilayer_perceptron(
-        images, layer_size=[128, 64], lbl_dim=lbl_dim)
-
-    if is_infer:
-        return predict
-    else:
-        label = paddle.layer.data(
-            name='label', type=paddle.data_type.integer_value(lbl_dim))
-        return paddle.layer.classification_cost(input=predict, label=label)
-
-
-def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"):
-    if task == "train":
-        if not os.path.exists(save_dir):
-            os.mkdir(save_dir)
-
-        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
-        cost = network()
-        parameters = paddle.parameters.create(cost)
-        optimizer = paddle.optimizer.Momentum(
-            learning_rate=0.1 / 128.0,
-            momentum=0.9,
-            regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-
-        trainer = paddle.trainer.SGD(cost=cost,
-                                     parameters=parameters,
-                                     update_equation=optimizer)
-
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 100 == 0:
-                    logger.info("Pass %d, Batch %d, Cost %f, %s" %
-                                (event.pass_id, event.batch_id, event.cost,
-                                 event.metrics))
-            if isinstance(event, paddle.event.EndPass):
-                with gzip.open(
-                        os.path.join(save_dir, "params_pass_%d.tar" %
-                                     event.pass_id), "w") as f:
-                    trainer.save_parameter_to_tar(f)
-
-        trainer.train(
-            reader=paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.mnist.train(), buf_size=8192),
-                batch_size=128),
-            event_handler=event_handler,
-            num_passes=5)
-    elif task == "dump_config":
-        predict = network(is_infer=True)
-        dump_v2_config(predict, "trainer_config.bin", True)
-    else:
-        raise RuntimeError(("Error value for parameter task. "
-                            "Available options are: train and dump_config."))
-
-
-def parse_cmd():
-    parser = argparse.ArgumentParser(
-        description="PaddlePaddle MNIST demo for CAPI.")
-    parser.add_argument(
-        "--task",
-        type=str,
-        required=False,
-        help=("A string indicating the taks type. "
-              "Available options are: \"train\", \"dump_config\"."),
-        default="train")
-    parser.add_argument(
-        "--use_gpu",
-        type=bool,
-        help=("A bool flag indicating whether to use GPU device or not."),
-        default=False)
-    parser.add_argument(
-        "--trainer_count",
-        type=int,
-        help=("This parameter is only used in training task. It indicates "
-              "how many computing threads are created in training."),
-        default=1)
-    parser.add_argument(
-        "--save_dir",
-        type=str,
-        help=("This parameter is only used in training task. It indicates "
-              "path of the directory to save the trained models."),
-        default="models")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_cmd()
-    main(args.task, args.use_gpu, args.trainer_count, args.save_dir)
diff --git a/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py b/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
deleted file mode 100644
index eca2dce11..000000000
--- a/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore b/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
deleted file mode 100644
index fab7372d7..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
deleted file mode 100644
index 2fc8debdd..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-project(multi_thread)
-cmake_minimum_required(VERSION 2.8)
-
-find_package (Threads)
-
-if(NOT PADDLE_ROOT)
-  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
-endif()
-if(PADDLE_ROOT)
-  include_directories(${PADDLE_ROOT}/include)
-  link_directories(${PADDLE_ROOT}/lib)
-endif()
-
-set(CPU_SRCS main.c)
-add_executable(${PROJECT_NAME} ${CPU_SRCS})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME}
-                      -lpaddle_capi_shared
-                      ${CMAKE_THREAD_LIBS_INIT})
-
-find_package(CUDA QUIET)
-if(CUDA_FOUND)
-  set(GPU_SRCS main_gpu.c)
-  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
-  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
-  target_link_libraries(${PROJECT_NAME}_gpu
-                        -lpaddle_capi_shared
-                        ${CMAKE_THREAD_LIBS_INIT})
-endif(CUDA_FOUND)
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
deleted file mode 100644
index b29f2cd21..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/main.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main.c
deleted file mode 100644
index 0a99e6b9c..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/main.c
+++ /dev/null
@@ -1,112 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <pthread.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-#define NUM_THREAD 4
-#define NUM_ITER 1000
-
-pthread_mutex_t mutex;
-
-void* thread_main(void* gm_ptr) {
-  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
-  paddle_arguments in_args = paddle_arguments_create_none();
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ false);
-  paddle_arguments out_args = paddle_arguments_create_none();
-  paddle_matrix prob = paddle_matrix_create_none();
-  for (int iter = 0; iter < NUM_ITER; ++iter) {
-    // There is only one input of this network.
-    CHECK(paddle_arguments_resize(in_args, 1));
-
-    paddle_real* array;
-
-    // Get First row.
-    CHECK(paddle_matrix_get_row(mat, 0, &array));
-
-    for (int i = 0; i < 784; ++i) {
-      array[i] = rand() / ((float)RAND_MAX);
-    }
-
-    CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-    CHECK(paddle_gradient_machine_forward(machine,
-                                          in_args,
-                                          out_args,
-                                          /* isTrain */ false));
-
-    CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-    CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-    pthread_mutex_lock(&mutex);
-    printf("Prob: ");
-    for (int i = 0; i < 10; ++i) {
-      printf("%.2f ", array[i]);
-    }
-    printf("\n");
-    pthread_mutex_unlock(&mutex);
-  }
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-  return NULL;
-}
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  srand(time(0));
-  pthread_mutex_init(&mutex, NULL);
-
-  pthread_t threads[NUM_THREAD];
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    paddle_gradient_machine thread_local_machine;
-    CHECK(paddle_gradient_machine_create_shared_param(
-        machine, buf, size, &thread_local_machine));
-    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
-  }
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    pthread_join(threads[i], NULL);
-  }
-
-  pthread_mutex_destroy(&mutex);
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
deleted file mode 100644
index 60f0c59e7..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
+++ /dev/null
@@ -1,127 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <pthread.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-#define NUM_THREAD 4
-#define NUM_ITER 1000
-
-pthread_mutex_t mutex;
-
-/*
- * @brief It is an simple inference example that runs multi-threads on a GPU.
- *        Each thread holds it own local gradient_machine but shares the same
- *        parameters.
- *        If you want to run on different GPUs, you need to launch
- *        multi-processes or set trainer_count > 1.
- */
-void* thread_main(void* gm_ptr) {
-  // Initialize the thread environment of Paddle.
-  CHECK(paddle_init_thread());
-
-  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
-  // Create input arguments.
-  paddle_arguments in_args = paddle_arguments_create_none();
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ true);
-  // Create output arguments.
-  paddle_arguments out_args = paddle_arguments_create_none();
-  // Create output matrix.
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  // CPU buffer to cache the input and output.
-  paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
-  paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
-  for (int iter = 0; iter < NUM_ITER; ++iter) {
-    // There is only one input layer of this network.
-    CHECK(paddle_arguments_resize(in_args, 1));
-    CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-    for (int i = 0; i < 784; ++i) {
-      cpu_input[i] = rand() / ((float)RAND_MAX);
-    }
-    CHECK(paddle_matrix_set_value(mat, cpu_input));
-
-    CHECK(paddle_gradient_machine_forward(machine,
-                                          in_args,
-                                          out_args,
-                                          /* isTrain */ false));
-
-    CHECK(paddle_arguments_get_value(out_args, 0, prob));
-    CHECK(paddle_matrix_get_value(prob, cpu_output));
-
-    pthread_mutex_lock(&mutex);
-    printf("Prob: ");
-    for (int i = 0; i < 10; ++i) {
-      printf("%.2f ", cpu_output[i]);
-    }
-    printf("\n");
-    pthread_mutex_unlock(&mutex);
-  }
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  free(cpu_input);
-  free(cpu_output);
-
-  return NULL;
-}
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=True"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  srand(time(0));
-  pthread_mutex_init(&mutex, NULL);
-
-  pthread_t threads[NUM_THREAD];
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    paddle_gradient_machine thread_local_machine;
-    CHECK(paddle_gradient_machine_create_shared_param(
-        machine, buf, size, &thread_local_machine));
-    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
-  }
-
-  for (int i = 0; i < NUM_THREAD; ++i) {
-    pthread_join(threads[i], NULL);
-  }
-
-  pthread_mutex_destroy(&mutex);
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
deleted file mode 100755
index fa6a12319..000000000
--- a/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reservedd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/.gitignore b/paddle/legacy/capi/examples/model_inference/sequence/.gitignore
deleted file mode 100644
index fab7372d7..000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
deleted file mode 100644
index 71b73acba..000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-project(sequence)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
deleted file mode 100644
index b29f2cd21..000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/main.c b/paddle/legacy/capi/examples/model_inference/sequence/main.c
deleted file mode 100644
index 25a38d32f..000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/main.c
+++ /dev/null
@@ -1,84 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Reading config binary file. It is generated by `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create a gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one input of this network.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Create input ids.
-  int sentence_ids[] = {83, 48, 20, 84, 394, 853, 64, 53, 64};
-
-  paddle_ivector sentence = paddle_ivector_create(
-      sentence_ids, sizeof(sentence_ids) / sizeof(int), false, false);
-  CHECK(paddle_arguments_set_ids(in_args, 0, sentence));
-
-  int seq_pos_array[] = {0, sizeof(sentence_ids) / sizeof(int)};
-
-  paddle_ivector seq_pos = paddle_ivector_create(
-      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
-
-  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
-
-  paddle_arguments out_args = paddle_arguments_create_none();
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  paddle_real* array;
-
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: ");
-  for (int i = 0; i < 2; ++i) {
-    printf("%.2f ", array[i]);
-  }
-  printf("\n");
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_ivector_destroy(seq_pos));
-  CHECK(paddle_ivector_destroy(sentence));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
deleted file mode 100644
index 62ae97e26..000000000
--- a/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-WORD_DIM = 3000
-
-sentence = data_layer(name='sentence', size=WORD_DIM)
-sentence_embedding = embedding_layer(
-    input=sentence,
-    size=64,
-    param_attr=ParameterAttribute(
-        initial_max=1.0, initial_min=0.5))
-lstm = simple_lstm(input=sentence_embedding, size=64)
-lstm_last = last_seq(input=lstm)
-outputs(fc_layer(input=lstm_last, size=2, act=SoftmaxActivation()))
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
deleted file mode 100644
index fab7372d7..000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
+++ /dev/null
@@ -1,73 +0,0 @@
-# This file is used to ignore files which are generated
-# ----------------------------------------------------------------------------
-
-*~
-*.autosave
-*.a
-*.core
-*.moc
-*.o
-*.obj
-*.orig
-*.rej
-*.so
-*.so.*
-*_pch.h.cpp
-*_resource.rc
-*.qm
-.#*
-*.*#
-core
-!core/
-tags
-.DS_Store
-.directory
-*.debug
-Makefile*
-*.prl
-*.app
-moc_*.cpp
-ui_*.h
-qrc_*.cpp
-Thumbs.db
-*.res
-*.rc
-/.qmake.cache
-/.qmake.stash
-
-# qtcreator generated files
-*.pro.user*
-
-# xemacs temporary files
-*.flc
-
-# Vim temporary files
-.*.swp
-
-# Visual Studio generated files
-*.ib_pdb_index
-*.idb
-*.ilk
-*.pdb
-*.sln
-*.suo
-*.vcproj
-*vcproj.*.*.user
-*.ncb
-*.sdf
-*.opensdf
-*.vcxproj
-*vcxproj.*
-
-# MinGW generated files
-*.Debug
-*.Release
-
-# Python byte code
-*.pyc
-
-# Binaries
-# --------
-*.dll
-*.exe
-
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
deleted file mode 100644
index c82195688..000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-project(sparse_binary)
-cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
-find_package (Threads)
-set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
deleted file mode 100644
index b29f2cd21..000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
+++ /dev/null
@@ -1 +0,0 @@
-../dense/convert_protobin.sh
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c b/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
deleted file mode 100644
index 8df1b6008..000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
+++ /dev/null
@@ -1,87 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <paddle/capi.h>
-#include <time.h>
-
-#include "../common/common.h"
-
-#define CONFIG_BIN "./trainer_config.bin"
-
-int main() {
-  // Initalize Paddle
-  char* argv[] = {"--use_gpu=False"};
-  CHECK(paddle_init(1, (char**)argv));
-
-  // Read the binary configuration file which is generated by
-  // `convert_protobin.sh`
-  long size;
-  void* buf = read_config(CONFIG_BIN, &size);
-
-  // Create the gradient machine for inference.
-  paddle_gradient_machine machine;
-  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
-
-  // Load the trained parameters. Uncomment the following line and change the
-  // directory as needed.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
-  paddle_arguments in_args = paddle_arguments_create_none();
-
-  // There is only one input of this network.
-  CHECK(paddle_arguments_resize(in_args, 1));
-
-  // Create the input matrix.
-  paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
-  srand(time(0));
-  paddle_real* array;
-  int colBuf[] = {9, 93, 109};
-  int rowBuf[] = {0, sizeof(colBuf) / sizeof(int)};
-
-  CHECK(paddle_matrix_sparse_copy_from(mat,
-                                       rowBuf,
-                                       sizeof(rowBuf) / sizeof(int),
-                                       colBuf,
-                                       sizeof(colBuf) / sizeof(int),
-                                       NULL,
-                                       0));
-
-  CHECK(paddle_arguments_set_value(in_args, 0, mat));
-
-  paddle_arguments out_args = paddle_arguments_create_none();
-  CHECK(paddle_gradient_machine_forward(machine,
-                                        in_args,
-                                        out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
-
-  CHECK(paddle_arguments_get_value(out_args, 0, prob));
-
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
-
-  printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
-  }
-  printf("\n");
-
-  CHECK(paddle_matrix_destroy(prob));
-  CHECK(paddle_arguments_destroy(out_args));
-  CHECK(paddle_matrix_destroy(mat));
-  CHECK(paddle_arguments_destroy(in_args));
-  CHECK(paddle_gradient_machine_destroy(machine));
-
-  return 0;
-}
diff --git a/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
deleted file mode 100755
index fa6a12319..000000000
--- a/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reservedd.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/capi/gradient_machine.cpp b/paddle/legacy/capi/gradient_machine.cpp
deleted file mode 100644
index 0c5ddd856..000000000
--- a/paddle/legacy/capi/gradient_machine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gradient_machine.h"
-#include "capi_private.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
-};
-
-namespace paddle {
-
-class MyNeuralNetwork : public NeuralNetwork {
- public:
-  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
-      : NeuralNetwork(name, network) {}
-};
-
-NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                      NeuralNetwork* network) {
-  return new MyNeuralNetwork(name, network);
-}
-}  // namespace paddle
-
-extern "C" {
-paddle_error paddle_gradient_machine_create_for_inference(
-    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
-  if (modelConfigProtobuf == nullptr) return kPD_NULLPTR;
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
-    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
-  if (mergedModel == nullptr) return kPD_NULLPTR;
-  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
-  int64_t modelConfigSize = 0;
-  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
-  std::string modelConfigProtobuf;
-  modelConfigProtobuf.resize(modelConfigSize);
-  is.read(&modelConfigProtobuf[0], modelConfigSize);
-  paddle::TrainerConfig config;
-  paddle::ModelConfig modelConfig;
-  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
-    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
-        !modelConfig.IsInitialized()) {
-      return kPD_PROTOBUF_ERROR;
-    }
-  } else {
-    modelConfig = config.model_config();
-  }
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
-  delete cast(machine);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_load_parameter_from_disk(
-    paddle_gradient_machine machine, const char* path) {
-  auto m = cast(machine);
-  if (m == nullptr || path == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->loadParameters(path);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
-                                             paddle_arguments inArgs,
-                                             paddle_arguments outArgs,
-                                             bool isTrain) {
-  auto m = cast(machine);
-  auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
-  if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->forward(
-      in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_shared_param(
-    paddle_gradient_machine origin,
-    void* modelConfigProtobuf,
-    int size,
-    paddle_gradient_machine* slave) {
-  auto o = cast(origin);
-  if (origin == nullptr || slave == nullptr || o->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
-      new paddle::capi::CGradientMachine());
-  auto nn = paddle::NeuralNetwork::create(config);
-  nn->init(config,
-           [&o](int paramId, paddle::Parameter* param) {
-             auto p = o->machine->getParameters()[paramId];
-             param->enableSharedType(paddle::PARAMETER_VALUE,
-                                     p->getBuf(paddle::PARAMETER_VALUE));
-           },
-           {paddle::PARAMETER_VALUE},
-           false);
-  ptr->machine.reset(nn);
-  *slave = ptr.release();
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_error paddle_gradient_machine_randomize_param(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
-  m->machine->randParameters();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_get_layer_output(
-    paddle_gradient_machine machine,
-    const char* layerName,
-    paddle_arguments args) {
-  auto m = cast(machine);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
-  if (m == nullptr || layerName == nullptr || out == nullptr ||
-      m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-
-  auto layerOutput = m->machine->getLayerOutput(layerName);
-  out->args.push_back(layerOutput);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_release_layer_output(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  m->machine->releaseOutput();
-  return kPD_NO_ERROR;
-}
diff --git a/paddle/legacy/capi/gradient_machine.h b/paddle/legacy/capi/gradient_machine.h
deleted file mode 100644
index f46498b37..000000000
--- a/paddle/legacy/capi/gradient_machine.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_GRADIENT_MACHINE_H__
-#define __PADDLE_CAPI_GRADIENT_MACHINE_H__
-#include "arguments.h"
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-/**
- * @brief GradientMachine means a neural network.
- */
-typedef void* paddle_gradient_machine;
-
-/**
- * @brief Create a gradient machine used for model inference.
- * @param [out] machine that used for model inference.
- * @param [in] modelConfigProtobuf
- * @param [in] size
- * @return paddle_error
- */
-PD_API paddle_error paddle_gradient_machine_create_for_inference(
-    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
-
-/**
- * @brief Create a gradient machine used for model inference, using config with
- *        parameters which is generated by `paddle merge_model`.
- *        Example:
- *          paddle merge_model \
- *                 --model_dir="pass-00000" \
- *                 --model_file="merged_model.paddle"
- * @param [out] machine that used for model inference
- * @param [in] mergedModel
- * @param [in] size
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_create_for_inference_with_parameters(
-    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
-
-/**
- * @brief Load parameter from disk.
- * @param machine Gradient Machine.
- * @param path local directory path.
- * @return paddle_error
- */
-PD_API paddle_error paddle_gradient_machine_load_parameter_from_disk(
-    paddle_gradient_machine machine, const char* path);
-
-/**
- * @brief Forward a gradient machine
- * @param machine Gradient machine
- * @param inArgs input arguments
- * @param outArgs output arguments
- * @param isTrain is train or not
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_forward(paddle_gradient_machine machine,
-                                paddle_arguments inArgs,
-                                paddle_arguments outArgs,
-                                bool isTrain);
-
-/**
- * @brief Create a gradient machine, which parameters are shared from another
- *        gradient machine.
- * @param [in] origin gradient machine
- * @param [in] modelConfigProtobuf model config protobuf
- * @param [in] size of model config buffer.
- * @param [out] slave gradient machine, the output value.
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_create_shared_param(paddle_gradient_machine origin,
-                                            void* modelConfigProtobuf,
-                                            int size,
-                                            paddle_gradient_machine* slave);
-
-PD_API paddle_error
-paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
-
-/**
- * @brief Destroy a gradient machine
- * @param machine that need to destroy
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_destroy(paddle_gradient_machine machine);
-
-/**
- * @brief Get the output of the layer named `layerName`.
- * @param [in] gradient machine that have run a inference
- * @param [in] layerName name of specified layer
- * @param [out] args output of the specified layer
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
-                                         const char* layerName,
-                                         paddle_arguments args);
-
-/**
- * @brief Release the middle layer's output memory of the gradient machine.
- * @param [in] gradient machine that have run a inference
- * @return paddle_error
- */
-PD_API paddle_error
-paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/paddle/legacy/capi/main.h b/paddle/legacy/capi/main.h
deleted file mode 100644
index a0cb7bc29..000000000
--- a/paddle/legacy/capi/main.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_MAIN_H__
-#define __PADDLE_CAPI_MAIN_H__
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Initialize Paddle.
- */
-PD_API paddle_error paddle_init(int argc, char** argv);
-
-/**
- * Initialize the thread environment of Paddle.
- * @note it is requisite for GPU runs but optional for CPU runs.
- *       For GPU runs, all threads will run on the same GPU devices.
- */
-PD_API paddle_error paddle_init_thread();
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/capi/matrix.h b/paddle/legacy/capi/matrix.h
deleted file mode 100644
index f6747f7b1..000000000
--- a/paddle/legacy/capi/matrix.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_MATRIX_H__
-#define __PADDLE_CAPI_MATRIX_H__
-
-#include <stdbool.h>
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Matrix functions. Return will be a paddle_error type.
- */
-typedef void* paddle_matrix;
-
-/**
- * @brief paddle_matrix_create Create a dense matrix
- * @param height matrix height.
- * @param width matrix width
- * @param useGpu use GPU of not
- * @return Matrix handler
- */
-PD_API paddle_matrix paddle_matrix_create(uint64_t height,
-                                          uint64_t width,
-                                          bool useGpu);
-
-/**
- * @brief paddle_matrix_create_sparse Create a sparse matrix.
- * @param height the matrix height.
- * @param width the matrix width.
- * @param nnz the number of non-zero elements.
- * @param isBinary is binary (either 1 or 0 in matrix) or not.
- * @param useGpu is using GPU or not.
- * @return paddle_matrix.
- * @note Mobile inference does not support this interface.
- */
-PD_API paddle_matrix paddle_matrix_create_sparse(
-    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
-
-/**
- * @brief paddle_matrix_destroy Destroy a matrix.
- * @param mat
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_destroy(paddle_matrix mat);
-
-/**
- * @brief paddle_matrix_set_row Set a row to matrix.
- * @param mat Target Matrix
- * @param rowID Index of row
- * @param rowArray Row data.
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
-                                          uint64_t rowID,
-                                          paddle_real* rowArray);
-
-/**
- * @brief paddle_matrix_set_value Set value to matrix.
- * @param mat Target Matrix
- * @param value Row data.
- * @return paddle_error
- * @note  value should contain enough element of data to init the mat
- */
-PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                            paddle_real* value);
-
-/**
- * @brief PDMatGetRow Get raw row buffer from matrix
- * @param [in] mat Target matrix
- * @param [in] rowID Index of row.
- * @param [out] rawRowBuffer Row Buffer
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
-                                          uint64_t rowID,
-                                          paddle_real** rawRowBuffer);
-
-/**
- * @brief copy data from the matrix
- * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data
- * @return paddle_error
- * @note the space of the result should allocated before invoke this API
- */
-PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                            paddle_real* result);
-/**
- * @brief PDMatCreateNone Create None Matrix
- * @return
- */
-PD_API paddle_matrix paddle_matrix_create_none();
-
-/**
- * @brief PDMatGetShape get the shape of matrix
- * @param mat target matrix
- * @param height The height of matrix
- * @param width The width of matrix
- * @return paddle_error
- */
-PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
-                                            uint64_t* height,
-                                            uint64_t* width);
-
-/**
- * @brief paddle_matrix_sparse_copy_from Copy from a CSR format matrix
- * @param [out] mat output matrix
- * @param [in] rowArray row array. The array slices in column array.
- * @param [in] rowSize length of row array.
- * @param [in] colArray the column array. It means the non-zero element indices
- * in each row.
- * @param [in] colSize length of column array.
- * @param [in] valueArray the value array. It means the non-zero elemnt values.
- * NULL if the matrix is binary.
- * @param [in] valueSize length of value array. Zero if the matrix is binary.
- * @return paddle_error
- * @note Mobile inference does not support this interface.
- */
-PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
-                                                   int* rowArray,
-                                                   uint64_t rowSize,
-                                                   int* colArray,
-                                                   uint64_t colSize,
-                                                   float* valueArray,
-                                                   uint64_t valueSize);
-
-#ifdef __cplusplus
-}
-#endif
-#endif
diff --git a/paddle/legacy/capi/paddle_capi.map b/paddle/legacy/capi/paddle_capi.map
deleted file mode 100644
index 8d673f675..000000000
--- a/paddle/legacy/capi/paddle_capi.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-	global:
-		paddle_*;
-	local:
-		*;
-};
diff --git a/paddle/legacy/capi/tests/.gitignore b/paddle/legacy/capi/tests/.gitignore
deleted file mode 100644
index 7ab6be95e..000000000
--- a/paddle/legacy/capi/tests/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-w
-b
diff --git a/paddle/legacy/capi/tests/CMakeLists.txt b/paddle/legacy/capi/tests/CMakeLists.txt
deleted file mode 100644
index bb38ace62..000000000
--- a/paddle/legacy/capi/tests/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-add_unittest(capi_test_mats test_Vector.cpp
-  test_Matrix.cpp test_Arguments.cpp)
-
-target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
-target_link_libraries(capi_test_mats paddle_capi)
-
-if(NOT MOBILE_INFERENCE)
-    add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
-    target_include_directories(capi_test_gradientMachine PUBLIC
-      ${PADDLE_CAPI_INC_PATH})
-    target_link_libraries(capi_test_gradientMachine paddle_capi)
-    add_test(NAME capi_test_gradientMachine
-      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
-endif()
diff --git a/paddle/legacy/capi/tests/test_Arguments.cpp b/paddle/legacy/capi/tests/test_Arguments.cpp
deleted file mode 100644
index 6fb379719..000000000
--- a/paddle/legacy/capi/tests/test_Arguments.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "capi.h"
-#include "gtest/gtest.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(CAPIArguments, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_arguments args = paddle_arguments_create_none();
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
-  ASSERT_EQ(0UL, size);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, value) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_matrix mat = paddle_matrix_create(128, 64, false);
-  for (size_t i = 0; i < 128; ++i) {
-    std::vector<paddle_real> sampleBuf = randomBuffer(64);
-    paddle_matrix_set_row(mat, i, sampleBuf.data());
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
-
-  paddle_matrix val = paddle_matrix_create_none();
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
-
-  for (size_t i = 0; i < 128; ++i) {
-    paddle_real* row1;
-    paddle_real* row2;
-
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
-    ASSERT_EQ(row1, row2);
-  }
-
-  paddle_ivector ivec = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, ids) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-template <typename T1, typename T2>
-void testSequenceHelper(T1 setter, T2 getter) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
-
-  int* rawBuf;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_EQ(array[i], rawBuf[i]);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, Sequence) {
-  auto testSequence = [](uint32_t nestedLevel) {
-    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3),
-                       std::bind(paddle_arguments_get_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3));
-  };
-  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
-    testSequence(i);
-  }
-}
diff --git a/paddle/legacy/capi/tests/test_GradientMachine.cpp b/paddle/legacy/capi/tests/test_GradientMachine.cpp
deleted file mode 100644
index 5d1b7cb6c..000000000
--- a/paddle/legacy/capi/tests/test_GradientMachine.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/legacy/trainer/TrainerConfigHelper.h>
-#include <stdlib.h>
-#include <string.h>
-#include <type_traits>
-#include "capi.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(GradientMachine, testPredict) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle::TrainerConfigHelper config("./test_predict_network.py");
-  std::string buffer;
-  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
-  paddle_gradient_machine machine;
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_for_inference(
-                &machine, &buffer[0], (int)buffer.size()));
-  std::unique_ptr<paddle::GradientMachine> gm(
-      paddle::GradientMachine::create(config.getModelConfig()));
-  ASSERT_NE(nullptr, gm);
-  gm->randParameters();
-  gm->saveParameters("./");
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
-
-  paddle_gradient_machine machineSlave;
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_shared_param(
-                machine, &buffer[0], (int)buffer.size(), &machineSlave));
-  std::swap(machineSlave, machine);
-  paddle_arguments outArgs = paddle_arguments_create_none();
-
-  paddle_arguments inArgs = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
-  paddle_matrix mat = paddle_matrix_create(1, 100, false);
-  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
-
-  auto data = randomBuffer(100);
-  paddle_real* rowPtr;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
-
-  uint64_t sz;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
-  ASSERT_EQ(1UL, sz);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
-  std::vector<paddle::Argument> paddleInArgs;
-  std::vector<paddle::Argument> paddleOutArgs;
-  paddleInArgs.resize(1);
-  paddleInArgs[0].value =
-      paddle::Matrix::create(data.data(), 1, 100, false, false);
-
-  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
-
-  auto matPaddle = paddleOutArgs[0].value;
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(matPaddle->getHeight(), height);
-  ASSERT_EQ(matPaddle->getWidth(), width);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  for (size_t i = 0; i < width; ++i) {
-    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
-  std::swap(machineSlave, machine);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  std::vector<char*> argvs;
-  argvs.push_back(strdup("--use_gpu=false"));
-  paddle_init((int)argvs.size(), argvs.data());
-  for (auto each : argvs) {
-    free(each);
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/capi/tests/test_Matrix.cpp b/paddle/legacy/capi/tests/test_Matrix.cpp
deleted file mode 100644
index 5ba051ae1..000000000
--- a/paddle/legacy/capi/tests/test_Matrix.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "gtest/gtest.h"
-
-TEST(CAPIMatrix, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_matrix mat = paddle_matrix_create(128, 32, false);
-  std::vector<paddle_real> sampleRow;
-  sampleRow.resize(32);
-  for (size_t i = 0; i < sampleRow.size(); ++i) {
-    sampleRow[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, 0, sampleRow.data()));
-  ASSERT_EQ(kPD_OUT_OF_RANGE,
-            paddle_matrix_set_row(mat, 128, sampleRow.data()));
-
-  paddle_real* arrayPtr;
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &arrayPtr));
-  for (size_t i = 0; i < sampleRow.size(); ++i) {
-    ASSERT_NEAR(sampleRow[i], arrayPtr[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-TEST(CAPIMatrix, createNone) {
-  paddle_matrix mat = paddle_matrix_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-TEST(CAPIMatrix, cpu_get_set_value) {
-  paddle_matrix mat = paddle_matrix_create(128, 32, false);
-  std::vector<paddle_real> sample;
-  std::vector<paddle_real> result;
-  sample.resize(128 * 32);
-  result.resize(128 * 32);
-  for (size_t i = 0; i < sample.size(); ++i) {
-    sample[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
-  for (size_t i = 0; i < sample.size(); ++i) {
-    ASSERT_NEAR(sample[i], result[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(CAPIMatrix, gpu_get_set_value) {
-  paddle_matrix mat = paddle_matrix_create(128, 32, true);
-  std::vector<paddle_real> sample;
-  std::vector<paddle_real> result;
-  sample.resize(128 * 32);
-  result.resize(128 * 32);
-  for (size_t i = 0; i < sample.size(); ++i) {
-    sample[i] = 1.0 / (i + 1.0);
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
-  for (size_t i = 0; i < sample.size(); ++i) {
-    ASSERT_NEAR(sample[i], result[i], 1e-5);
-  }
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(128UL, height);
-  ASSERT_EQ(32UL, width);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-}
-#endif
diff --git a/paddle/legacy/capi/tests/test_Vector.cpp b/paddle/legacy/capi/tests/test_Vector.cpp
deleted file mode 100644
index fa7407e48..000000000
--- a/paddle/legacy/capi/tests/test_Vector.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "gtest/gtest.h"
-
-TEST(CAPIVector, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_ivector vec;
-  int array[3] = {1, 2, 3};
-  vec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_resize(vec, 1000));
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(vec, &size));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
-}
-
-TEST(CAPIVector, createNone) {
-  paddle_ivector vec = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
-}
diff --git a/paddle/legacy/capi/tests/test_predict_network.py b/paddle/legacy/capi/tests/test_predict_network.py
deleted file mode 100644
index b8efb2570..000000000
--- a/paddle/legacy/capi/tests/test_predict_network.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100)
-
-x = data_layer(name='x', size=100)
-
-y = fc_layer(
-    input=x,
-    size=100,
-    bias_attr=ParamAttr(name='b'),
-    param_attr=ParamAttr(name='w'))
-
-outputs(y)
diff --git a/paddle/legacy/capi/vector.h b/paddle/legacy/capi/vector.h
deleted file mode 100644
index a79f7fdf7..000000000
--- a/paddle/legacy/capi/vector.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef __PADDLE_CAPI_VECTOR_H__
-#define __PADDLE_CAPI_VECTOR_H__
-
-#include <stdbool.h>
-#include <stdint.h>
-#include "config.h"
-#include "error.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/**
- * Int Vector Functions. Return will be a paddle_error type.
- */
-typedef void* paddle_ivector;
-
-/**
- * @brief Create an none int vector. It just a handler and store nothing. Used
- *        to get output from other api.
- * @return None int vector.
- */
-PD_API paddle_ivector paddle_ivector_create_none();
-
-/**
- * @brief paddle_ivector_create create a paddle int vector
- * @param array: input array.
- * @param size: input array size.
- * @param copy: memory copy or just use same memory. True if copy.
- * @param useGPU: True if use GPU
- * @return paddle_error
- */
-PD_API paddle_ivector paddle_ivector_create(int* array,
-                                            uint64_t size,
-                                            bool copy,
-                                            bool useGPU);
-
-/**
- * @brief paddle_ivector_destroy destory an int vector.
- * @param ivec vector to be destoried.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_destroy(paddle_ivector ivec);
-
-/**
- * @brief paddle_ivector_get get raw buffer stored inside this int vector. It
- * could be GPU memory if this int vector is stored in GPU.
- * @param [in] ivec int vector
- * @param [out] buffer the return buffer pointer.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer);
-
-/**
- * @brief paddle_ivector_resize resize the int vector.
- * @param [in] ivec: int vector
- * @param [in] size: size to change
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size);
-
-/**
- * @brief paddle_ivector_get_size get the size of int vector.
- * @param [in] ivec: int vector
- * @param [out] size: return size of this int vector.
- * @return paddle_error
- */
-PD_API paddle_error paddle_ivector_get_size(paddle_ivector ivec,
-                                            uint64_t* size);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
diff --git a/paddle/legacy/cuda/CMakeLists.txt b/paddle/legacy/cuda/CMakeLists.txt
deleted file mode 100755
index 9bbb8de78..000000000
--- a/paddle/legacy/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-set(AVX_SOURCES
-    src/hl_math.cc
-    src/hl_avx_functions.cc
-)
-
-if(WITH_AVX)
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc
-        ${AVX_SOURCES})
-else()
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc)
-endif()
-
-set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cuda_cublas.cc
-    src/hl_cuda_cudnn.cc
-    src/hl_cuda_device.cc)
-
-if(WITH_GPU)
-    set(CUDA_CXX_SOURCES
-        src/hl_warpctc_wrap.cc
-        ${CUDA_CXX_WITH_GPU_SOURCES})
-
-    set_source_files_properties(${CUDA_CXX_SOURCES}
-                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
-else()
-    if (NOT MOBILE_INFERENCE)
-    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
-    endif()
-endif()
-
-set(CUDA_CU_SOURCES
-    src/hl_perturbation_util.cu
-    src/hl_cuda_aggregate.cu
-    src/hl_cuda_matrix.cu
-    src/hl_cuda_sparse.cu
-    src/hl_cuda_cnn.cu
-    src/hl_cuda_lstm.cu
-    src/hl_top_k.cu
-    src/hl_batch_transpose.cu
-    src/hl_batch_norm.cu
-    src/hl_cuda_sequence.cu
-    src/hl_table_apply.cu)
-
-set(CUDA_HEADERS
-    include/hl_time.h
-    include/hl_warpctc_wrap.h
-    include/hl_sequence.h
-    include/hl_cuda_cublas.h
-    include/hl_batch_transpose.h
-    include/hl_avx_functions.h
-    include/hl_sparse.h
-    include/hl_functions.h
-    include/hl_cuda_cudnn.h
-    include/hl_activation_functions.h
-    include/hl_base.h
-    include/stub/hl_cuda_cudnn_stub.h
-    include/stub/hl_cuda_stub.h
-    include/stub/hl_cuda_cublas_stub.h
-    include/stub/hl_cnn_stub.h
-    include/stub/hl_lstm_stub.h
-    include/stub/hl_sequence_stub.h
-    include/stub/hl_aggregate_stub.h
-    include/stub/hl_sparse_stub.h
-    include/stub/hl_matrix_stub.h
-    include/hl_aggregate.h
-    include/hl_cuda.h
-    include/hl_lstm.h
-    include/hl_table_apply.h
-    include/hl_gpu.h
-    include/hl_top_k.h
-    include/hl_matrix.h
-    include/hl_cnn.h)
-
-if(WITH_GPU)
-    cuda_add_library(paddle_cuda
-        ${CUDA_SOURCES}
-        ${CUDA_CU_SOURCES}
-        ${CUDA_CXX_SOURCES})
-else()
-    add_library(paddle_cuda
-                ${CUDA_SOURCES}
-                ${CUDA_CXX_SOURCES})
-endif()
-
-add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
diff --git a/paddle/legacy/cuda/include/hl_activation_functions.h b/paddle/legacy/cuda/include/hl_activation_functions.h
deleted file mode 100644
index 66a69db54..000000000
--- a/paddle/legacy/cuda/include/hl_activation_functions.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_ACTIVATION_FUNCTIONS_H_
-#define HL_ACTIVATION_FUNCTIONS_H_
-
-#include "hl_functions.h"
-
-/**
- * Active functions: sigmoid, relu, tanh and linear.
- */
-#define HPPL_ACTIVE_FUNCTION \
-  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
-
-namespace hppl {
-
-/**
- * Hppl supports sigmoid, relu, tanh, linear active functions
- * for neural networks' forward and backward activation.
- */
-template <class T>
-class Active {
- public:
-  typedef T (*forward)(T);
-  typedef T (*backward)(T, T);
-};
-
-#ifdef __NVCC__
-namespace gpu {
-static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace gpu
-#else
-namespace cpu {
-static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace cpu
-
-#ifdef __AVX__
-namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace avx
-#endif
-#endif
-
-}  // namespace hppl
-
-#endif  // HL_ACTIVATION_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_aggregate.h b/paddle/legacy/cuda/include/hl_aggregate.h
deleted file mode 100644
index 1ca26aa3b..000000000
--- a/paddle/legacy/cuda/include/hl_aggregate.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AGGREGATE_H_
-#define HL_AGGREGATE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Calculate the sum of each row of the matrix A_d.
- *
- * @param[in]    A_d     input matrix (M x N).
- * @param[out]   C_d     output matrix (M x 1).
- * @param[in]    dimM    matrix height.
- * @param[in]    dimN    matrix width.
- *
- */
-extern void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the maximum value of each row of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the minimum value of each row of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the sum of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output Matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the maximum value of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   Calculate the minimum value of each column of the matrix A_d.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (1 x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN);
-
-/**
- * @brief   C_h = sum(A_d[i]).
- *
- * @param[in]   A_d     input(m).
- * @param[out]  C_h     output(host memory).
- * @param[in]   dimM    size of vector.
- *
- */
-extern void hl_vector_sum(real *A_d, real *C_h, int dimM);
-
-/**
- * @brief   C_h = sum(abs(A_d[i])).
- *
- * @param[in]   A_d     input(m).
- * @param[out]  C_h     output(host memory).
- * @param[in]   dimM    size of vector.
- *
- */
-extern void hl_vector_abs_sum(real *A_d, real *C_h, int dimM);
-
-#endif /* HL_AGGREGATE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_avx_functions.h b/paddle/legacy/cuda/include/hl_avx_functions.h
deleted file mode 100644
index 9fb99a36e..000000000
--- a/paddle/legacy/cuda/include/hl_avx_functions.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AVX_FUNCTIONS_H_
-#define HL_AVX_FUNCTIONS_H_
-
-#include <immintrin.h>
-
-namespace hppl {
-__m256 relu(const __m256 a);
-__m256 sigmoid(const __m256 a);
-__m256 tanh(const __m256 a);
-__m256 linear(const __m256 a);
-
-__m256 relu(const __m256 a, const __m256 b);
-__m256 sigmoid(const __m256 a, const __m256 b);
-__m256 tanh(const __m256 a, const __m256 b);
-__m256 linear(const __m256 a, const __m256 b);
-}  // namespace hppl
-
-#endif  // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_base.h b/paddle/legacy/cuda/include/hl_base.h
deleted file mode 100644
index bfe812a43..000000000
--- a/paddle/legacy/cuda/include/hl_base.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cstddef>
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define HL_FLOAT_MAX 3.40282347e+38F
-#define HL_FLOAT_MIN 1.17549435e-38F
-using real = double;
-#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
-using real = float;
-#endif
-
-/**
- * The maximum input value for exp, used to avoid overflow problem.
- * currently only used for tanh function.
- */
-#define EXP_MAX_INPUT 40.0
-
-/**
- * @brief DIVUP(x, y) is similar to ceil(x / y).
- * @note  For CUDA, DIVUP will be used to specify
- *        the size of blockDim.
- */
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
-#endif
-
-/**
- * HPPL is an internal high performance parallel computing library
- * for high-level neural network routines, which can support many
- * heterogeneous compute architectures, such as GPU, FPGA, etc.
- */
-
-/**
- * @brief   HPPL CUDA Stream.
- *
- * @note    Each thread can use HPPL_STREAM_* after calling hl_init.
- *          HPPL_STREAM_DEFAULT is HPPL default stream.
- */
-typedef enum {
-  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
-  HPPL_STREAM_1 = 1,
-  HPPL_STREAM_2 = 2,
-  HPPL_STREAM_3 = 3,
-  HPPL_STREAM_4 = 4,
-  HPPL_THREAD_STREAM_1 = 5,
-  HPPL_THREAD_STREAM_2 = 6,
-  HPPL_THREAD_STREAM_3 = 7,
-  HPPL_THREAD_STREAM_4 = 8,
-  HPPL_STREAM_END
-} hl_stream_t;
-
-/**
- * @brief HPPL activation mode.
- */
-typedef enum {
-  HL_ACTIVATION_SIGMOID = 0,
-  HL_ACTIVATION_RELU = 1,
-  HL_ACTIVATION_TANH = 2,
-  HL_ACTIVATION_LINEAR = 3,
-  HL_ACTIVATION_END
-} hl_activation_mode_t;
-
-/**
- * @brief Transpose type.
- */
-typedef enum {
-  HPPL_OP_N = 0, /* transpose */
-  HPPL_OP_T = 1, /* non transpose */
-  HPPL_OP_END
-} hl_trans_op_t;
-
-/**
- * @brief Lstm value.
- *
- * @param  gateValue         input value.
- * @param  prevStateValue    previous state value.
- * @param  stateValue        state value.
- * @param  stateActiveValue  state active value.
- * @param  outputValue       output value.
- */
-typedef struct {
-  real *gateValue;
-  real *prevStateValue;
-  real *stateValue;
-  real *stateActiveValue;
-  real *outputValue;
-  real *checkIg;
-  real *checkFg;
-  real *checkOg;
-} hl_lstm_value;
-
-/**
- * @brief Lstm gradient.
- *
- * @param  gateGrad          input gradient.
- * @param  prevStateGrad     previous state gradient.
- * @param  stateGrad         state gradient.
- * @param  stateActiveGrad   state active gradient.
- * @param  outputGrad        output gradient.
- */
-typedef struct {
-  real *gateGrad;
-  real *prevStateGrad;
-  real *stateGrad;
-  real *stateActiveGrad;
-  real *outputGrad;
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-} hl_lstm_grad;
-
-/**
- * @brief Gru value.
- *
- * @param  gateWeight           gate weight (updateGate + resetGate).
- * @param  stateWeight          frame state weight.
- * @param  gateValue            gate value results.
- * @param  resetOutputValue     resetOutput value.
- * @param  outputValue          output value.
- * @param  prevOutValue         previous output value.
- *
- */
-typedef struct {
-  real *gateWeight;
-  real *stateWeight;
-  real *gateValue;
-  real *resetOutputValue;
-  real *outputValue;
-  real *prevOutValue;
-} hl_gru_value;
-
-/**
- * @brief Gru gradient.
- *
- * @param  gateWeightGrad       gate weight gradient.
- * @param  stateWeightGrad      frame state weight gradient.
- * @param  gateGrad             gate gradient results.
- * @param  resetOutputGrad      resetOutput gradient.
- * @param  outputGrad           output gradient.
- * @param  prevOutGrad          previous output gradient.
- */
-typedef struct {
-  real *gateWeightGrad;
-  real *stateWeightGrad;
-  real *gateGrad;
-  real *resetOutputGrad;
-  real *outputGrad;
-  real *prevOutGrad;
-} hl_gru_grad;
-
-/**
- * @brief  Sparse matrix value type.
- */
-typedef enum {
-  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
-  HL_FLOAT_VALUE = 1,
-  HL_VALUE_END
-} hl_matrix_value_t;
-
-/**
- * @brief  HPPL matrix format.
- */
-typedef enum {
-  HL_SPARSE_CSR = 0,
-  HL_SPARSE_CSC = 1,
-  HL_SPARSE_END
-} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s *hl_matrix_s;
-
-/**
- * @brief   HPPL sparse matrix.
- *
- * @param  matrix     sparse matrix.
- * @param  format     matrix format.
- * @param  type       the type of matrix values.
- * @param  rows       matrix rows.
- * @param  cols       matrix columns.
- * @param  nnz        nonzero values of sparse matrix.
- */
-typedef struct {
-  hl_matrix_s matrix;
-  hl_matrix_format_t format;
-  hl_matrix_value_t type;
-  int rows;
-  int cols;
-  size_t nnz;
-} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-
-#ifdef __NVCC__
-
-#include <cuda_runtime.h>
-#include "paddle/legacy/cuda/include/hl_cuda.h"
-#include "paddle/legacy/utils/Logging.h"
-
-extern __thread bool g_sync_flag;
-extern __thread cudaStream_t default_stream;
-#define STREAM_DEFAULT default_stream
-
-/**
- * @brief   Check cuda kernel execution.
- * @param   msg   error string
- */
-#define CHECK_SYNC(msg)                                               \
-  if (true == g_sync_flag) {                                          \
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                       \
-    cudaError_t err = (cudaError_t)hl_get_device_last_error();        \
-    CHECK_EQ(cudaSuccess, err)                                        \
-        << "[" << msg << "] "                                         \
-        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
-  }
-
-// __shfl has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-
-template <typename T>
-__forceinline__ __device__ T
-__shfl_sync(unsigned, T val, int src_line, int width) {
-  return __shfl(val, src_line, width);
-}
-
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-#endif  // __NVCC__
diff --git a/paddle/legacy/cuda/include/hl_batch_norm.h b/paddle/legacy/cuda/include/hl_batch_norm.h
deleted file mode 100644
index 7814204d1..000000000
--- a/paddle/legacy/cuda/include/hl_batch_norm.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_BATCH_NORM_H_
-#define HL_BATCH_NORM_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   batch norm inferece.
- *
- * @param[in]   input         input data.
- * @param[out]  output        output data.
- * @param[in]   scale         batch normalization scale parameter (in original
- *                            paper scale is referred to as gamma).
- * @param[in]   bias          batch normalization bias parameter (in original
- *                            paper scale is referred to as beta).
- * @param[in]   estimatedMean
- * @param[in]   estimatedVar  The moving mean and variance
- *                            accumulated during the training phase are passed
- *                            as inputs here.
- * @param[in]   epsilon       Epsilon value used in the batch
- *                            normalization formula.
- */
-extern void hl_batch_norm_cuda_inference(const real* input,
-                                         real* output,
-                                         const real* scale,
-                                         const real* bias,
-                                         const real* estimatedMean,
-                                         const real* estimatedVar,
-                                         const double epsilon,
-                                         size_t batchSize,
-                                         size_t channel,
-                                         size_t height,
-                                         size_t width);
-
-#endif  // HL_BATCH_NORM_H_
diff --git a/paddle/legacy/cuda/include/hl_batch_transpose.h b/paddle/legacy/cuda/include/hl_batch_transpose.h
deleted file mode 100644
index a16d3764f..000000000
--- a/paddle/legacy/cuda/include/hl_batch_transpose.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_BATCH_TRANSPOSE_H_
-#define HL_BATCH_TRANSPOSE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Perform matrix transpose for each data in the batch.
- *
- * @param[in]   input     height * width elements in batch.
- * @param[out]  output    height * width elements in batch.
- * @param[in]   width     width of batch data.
- * @param[in]   height    height of batch data.
- * @param[in]   batchSize batch size
- *
- * @note    Both the inpt and output are arranged in batch-first
- *          order. Each batch has height * width data, which are
- *          arranged in height-first (or row-first) manner.
- */
-extern void batchTranspose(
-    const real* input, real* output, int width, int height, int batchSize);
-
-#endif  // HL_BATCH_TRANSPOSE_H_
diff --git a/paddle/legacy/cuda/include/hl_cnn.h b/paddle/legacy/cuda/include/hl_cnn.h
deleted file mode 100644
index b790fa39f..000000000
--- a/paddle/legacy/cuda/include/hl_cnn.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_H_
-#define HL_CNN_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Maximum pool forward with Mask output.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[out]  maskData    the location indices of select max data.
- */
-extern void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* maskData = NULL);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[out]  outData     output data.
- * @param[out]  outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  targetGrad  output grad.
- * @param[in]   outStride   stride between output data samples.
- *
- */
-extern void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride);
-
-/**
- * @brief   Averge pool forward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               bool excludeMode);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[out]  backGrad    output grad.
- * @param[in]   outStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                bool excludeMode);
-
-extern void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride);
-
-extern void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride);
-
-extern void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride);
-
-extern void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  int paddingD,
-                                  int paddingH,
-                                  int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride);
-
-/**
- * @brief   Bilinear interpolation forward.
- *
- * @param[in]   inData      input value.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[out]  outData     output value.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW);
-
-/**
- * @brief   Bilinear interpolation backward.
- *
- * @param[out]  inGrad      input gradient.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[in]   outGrad     output gradient.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW);
-
-/**
- * @brief   MaxOut forward.
- *
- * @param[in]   inData      input data.
- * @param[out]  outData     output data.
- * @param[out]  idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t groups);
-
-/**
- * @brief   MaxOut backward.
- *
- * @param[out]  inGrad      input grad data.
- * @param[in]   outGrad     output grad data.
- * @param[in]   idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t groups);
-
-/**
- * @brief   Upsample forward.
- * @param[in]   inputData   input data.
- * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
- * @param[out]  batchSize   the batch size of the input.
- * @param[in]   imgSizeH    image height.
- * @param[in]   imgSizeW    image width.
- * @param[in]   channels    the input channels.
- * @param[in]   outputH     the output height.
- * @param[in]   outputW     the output widht.
- * @param[out]  outputData  output data.
- */
-extern void hl_upsample_forward(real* inputData,
-                                real* maskData,
-                                size_t batchSize,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW,
-                                real* outputData);
-
-/**
- * @brief   Upsample backward.
- * @param[in]   outputGradData  the output grad data.
- * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
- * @param[out]  batchSize       the batch size of the input.
- * @param[in]   imgSizeH        image height.
- * @param[in]   imgSizeW        image width.
- * @param[in]   channels        the input channels.
- * @param[in]   outputH         the output height.
- * @param[in]   outputW         the output widht.
- * @param[out]  inputGradData   the input grad data.
- */
-extern void hl_upsample_backward(real* outputGradData,
-                                 real* maskData,
-                                 size_t batchSize,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 real* inputGradData);
-
-#endif  // HL_CNN_H_
diff --git a/paddle/legacy/cuda/include/hl_cpu_gru.cuh b/paddle/legacy/cuda/include/hl_cpu_gru.cuh
deleted file mode 100644
index ce1643932..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_gru.cuh
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CPU_GRU_CUH_
-#define HL_CPU_GRU_CUH_
-
-#ifndef __NVCC__
-
-template<class OpResetOutput>
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                       real *gateValue,
-                                       real *resetOutputValue,
-                                       real *prevOutputValue,
-                                       int frameSize,
-                                       hl_activation_mode_t active_gate) {
-  real rValueUpdateGate;
-  real rValueResetGate;
-  real rValueResetOutput;
-  real rPrevOut = 0;
-  real *updateGate = gateValue;
-  real *resetGate = gateValue + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
-    }
-
-    opResetOutput(rValueUpdateGate,
-                  rValueResetGate,
-                  rPrevOut,
-                  rValueResetOutput,
-                  hppl::cpu::forward[active_gate]);
-
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    resetOutputValue[i] = rValueResetOutput;
-  }
-}
-
-template<class OpFinalOutput>
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                       real *gateValue,
-                                       real *prevOutputValue,
-                                       real *outputValue,
-                                       int frameSize,
-                                       hl_activation_mode_t active_node) {
-  real rValueUpdateGate;
-  real rValueFrameState;
-  real rPrevOut = 0;
-  real rOutput;
-  real *updateGate = gateValue;
-  real *frameState = gateValue + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = prevOutputValue[i];
-    }
-
-    opFinalOutput(rValueUpdateGate,
-                  rValueFrameState,
-                  rPrevOut,
-                  rOutput,
-                  hppl::cpu::forward[active_node]);
-
-    frameState[i] = rValueFrameState;
-    outputValue[i] = rOutput;
-  }
-}
-
-template<class OpResetOutput>
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput,
-                                     real *gateValue,
-                                     real *resetOutputValue,
-                                     real *prevOutputValue,
-                                     int frameSize,
-                                     hl_activation_mode_t active_gate) {
-#ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueResetGate;
-  __m256 rValueResetOutput;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 *updateGate = (__m256*)gateValue;
-  __m256 *resetGate = (__m256*)(gateValue + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueResetGate = resetGate[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256*)prevOutputValue)[i];
-    }
-
-    opResetOutput(rValueUpdateGate,
-                  rValueResetGate,
-                  rPrevOut,
-                  rValueResetOutput,
-                  hppl::avx::forward[active_gate]);
-
-    updateGate[i] = rValueUpdateGate;
-    resetGate[i] = rValueResetGate;
-    ((__m256*)resetOutputValue)[i] = rValueResetOutput;
-  }
-#endif
-}
-
-template<class OpFinalOutput>
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput,
-                                     real *gateValue,
-                                     real *prevOutputValue,
-                                     real *outputValue,
-                                     int frameSize,
-                                     hl_activation_mode_t active_node) {
-#ifdef __AVX__
-  __m256 rValueUpdateGate;
-  __m256 rValueFrameState;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
-  __m256 rOutput;
-  __m256 *updateGate = (__m256*)gateValue;
-  __m256 *frameState = (__m256*)(gateValue + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueUpdateGate = updateGate[i];
-    rValueFrameState = frameState[i];
-    if (prevOutputValue) {
-      rPrevOut = ((__m256*)prevOutputValue)[i];
-    }
-
-    opFinalOutput(rValueUpdateGate,
-                  rValueFrameState,
-                  rPrevOut,
-                  rOutput,
-                  hppl::avx::forward[active_node]);
-
-    frameState[i] = rValueFrameState;
-    ((__m256*)outputValue)[i] = rOutput;
-  }
-#endif
-}
-
-template<class OpResetOutput>
-inline void forward_reset_output(OpResetOutput opResetOutput,
-                                 hl_gru_value value,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_forward_reset_output(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, active_gate);
-    } else {
-      hl_naive_gru_forward_reset_output(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, active_gate);
-    }
-
-    value.gateValue += frameSize * 3;
-    value.resetOutputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-  }
-}
-
-template<class OpFinalOutput>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
-                                 hl_gru_value value,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_forward_final_output(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, active_node);
-    } else {
-      hl_naive_gru_forward_final_output(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, active_node);
-    }
-
-    value.gateValue += frameSize * 3;
-    value.outputValue += frameSize;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-  }
-}
-
-template<class OpStateGrad>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
-                                      real *gateValue,
-                                      real *gateGrad,
-                                      real *prevOutValue,
-                                      real *prevOutGrad,
-                                      real *outputGrad,
-                                      int frameSize,
-                                      hl_activation_mode_t active_node) {
-  real rUpdateGateValue;
-  real rUpdateGateGrad;
-  real rFrameStateValue;
-  real rFrameStateGrad;
-  real rOutGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real *updateGateValue = gateValue;
-  real *updateGateGrad = gateGrad;
-  real *frameStateValue = gateValue + frameSize * 2;
-  real *frameStateGrad = gateGrad + frameSize * 2;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad  = outputGrad[i];
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = prevOutGrad[i];
-    }
-
-    opStateGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rFrameStateValue,
-                rFrameStateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rOutGrad,
-                hppl::cpu::backward[active_node]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
-    }
-  }
-}
-
-template<class OpResetGrad>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad,
-                                      real *gateValue,
-                                      real *gateGrad,
-                                      real *prevOutValue,
-                                      real *prevOutGrad,
-                                      real *resetOutputGrad,
-                                      int frameSize,
-                                      hl_activation_mode_t active_gate) {
-  real rUpdateGateValue;
-  real rUpdateGateGrad;
-  real rResetGateValue;
-  real rResetGateGrad;
-  real rResetOutputGrad = 0;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real *updateGateValue = gateValue;
-  real *updateGateGrad = gateGrad;
-  real *resetGateValue = gateValue + frameSize;
-  real *resetGateGrad = gateGrad + frameSize;
-
-  for (int i = 0; i < frameSize; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = resetOutputGrad[i];
-    }
-    if (prevOutValue) {
-      rPrevOutValue = prevOutValue[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = prevOutGrad[i];
-    }
-
-    opResetGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rResetGateValue,
-                rResetGateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rResetOutputGrad,
-                hppl::cpu::backward[active_gate]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      prevOutGrad[i] = rPrevOutGrad;
-    }
-  }
-}
-
-template<class OpStateGrad>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad,
-                                    real *gateValue,
-                                    real *gateGrad,
-                                    real *prevOutValue,
-                                    real *prevOutGrad,
-                                    real *outputGrad,
-                                    int frameSize,
-                                    hl_activation_mode_t active_node) {
-#ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rFrameStateValue;
-  __m256 rFrameStateGrad;
-  __m256 rOutGrad;
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad  = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256*)gateValue;
-  __m256 *updateGateGrad = (__m256*)gateGrad;
-  __m256 *frameStateValue = (__m256*)(gateValue + frameSize * 2);
-  __m256 *frameStateGrad = (__m256*)(gateGrad + frameSize * 2);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rFrameStateValue = frameStateValue[i];
-    rOutGrad  = ((__m256*)outputGrad)[i];
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256*)prevOutValue)[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = ((__m256*)prevOutGrad)[i];
-    }
-
-    opStateGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rFrameStateValue,
-                rFrameStateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rOutGrad,
-                hppl::avx::backward[active_node]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    frameStateGrad[i] = rFrameStateGrad;
-    if (prevOutGrad) {
-      ((__m256*)prevOutGrad)[i] = rPrevOutGrad;
-    }
-  }
-#endif
-}
-
-template<class OpResetGrad>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad,
-                                    real *gateValue,
-                                    real *gateGrad,
-                                    real *prevOutValue,
-                                    real *prevOutGrad,
-                                    real *resetOutputGrad,
-                                    int frameSize,
-                                    hl_activation_mode_t active_gate) {
-#ifdef __AVX__
-  __m256 rUpdateGateValue;
-  __m256 rUpdateGateGrad;
-  __m256 rResetGateValue;
-  __m256 rResetGateGrad;
-  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad  = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256*)gateValue;
-  __m256 *updateGateGrad = (__m256*)gateGrad;
-  __m256 *resetGateValue = (__m256*)(gateValue + frameSize);
-  __m256 *resetGateGrad = (__m256*)(gateGrad + frameSize);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
-    rUpdateGateGrad = updateGateGrad[i];
-    rResetGateValue = resetGateValue[i];
-
-    if (prevOutValue && prevOutGrad) {
-      rResetOutputGrad = ((__m256*)resetOutputGrad)[i];
-    }
-    if (prevOutValue) {
-      rPrevOutValue = ((__m256*)prevOutValue)[i];
-    }
-    if (prevOutGrad) {
-      rPrevOutGrad  = ((__m256*)prevOutGrad)[i];
-    }
-
-    opResetGrad(rUpdateGateValue,
-                rUpdateGateGrad,
-                rResetGateValue,
-                rResetGateGrad,
-                rPrevOutValue,
-                rPrevOutGrad,
-                rResetOutputGrad,
-                hppl::avx::backward[active_gate]);
-
-    updateGateGrad[i] = rUpdateGateGrad;
-    resetGateGrad[i] = rResetGateGrad;
-    if (prevOutGrad) {
-      ((__m256*)prevOutGrad)[i] = rPrevOutGrad;
-    }
-  }
-#endif
-}
-
-template<class OpStateGrad>
-inline void backward_state_grad(OpStateGrad opStateGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_backward_state_grad(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, active_node);
-    } else {
-      hl_naive_gru_backward_state_grad(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, active_node);
-    }
-
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 3;
-    grad.outputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
-    }
-  }
-}
-
-template<class OpResetGrad>
-inline void backward_reset_grad(OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_gate) {
-  for (int b = 0; b < batchSize; b++) {
-    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-      hl_avx_gru_backward_reset_grad(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, active_gate);
-    } else {
-      hl_naive_gru_backward_reset_grad(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, active_gate);
-    }
-
-    value.gateValue += frameSize * 3;
-    if (value.prevOutValue) {
-      value.prevOutValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 3;
-    grad.resetOutputGrad += frameSize;
-    if (grad.prevOutGrad) {
-      grad.prevOutGrad += frameSize;
-    }
-  }
-}
-
-#endif
-
-#endif  // HL_CPU_GRU_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_lstm.cuh b/paddle/legacy/cuda/include/hl_cpu_lstm.cuh
deleted file mode 100644
index 58a97d123..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_lstm.cuh
+++ /dev/null
@@ -1,372 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CPU_LSTM_CUH_
-#define HL_CPU_LSTM_CUH_
-
-#ifndef __NVCC__
-
-// using namespace hppl;
-
-template<class Op>
-void hl_naive_lstm_forward_one_sequence(Op op,
-                                        hl_lstm_value value,
-                                        int frameSize,
-                                        hl_activation_mode_t active_node,
-                                        hl_activation_mode_t active_gate,
-                                        hl_activation_mode_t active_state) {
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rCheckI;
-  real rCheckF;
-  real rCheckO;
-  real rState;
-  real rPrevState = 0;
-  real rStateAtv;
-  real rOut;
-
-  real *valueIn = value.gateValue;
-  real *valueIg = value.gateValue + frameSize;
-  real *valueFg = value.gateValue + frameSize * 2;
-  real *valueOg = value.gateValue + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rPrevState,
-       rState,
-       rStateAtv,
-       rOut,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       hppl::cpu::forward[active_node],
-       hppl::cpu::forward[active_gate],
-       hppl::cpu::forward[active_state]);
-
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    value.stateValue[i] = rState;
-    value.stateActiveValue[i] = rStateAtv;
-    value.outputValue[i] = rOut;
-  }
-}
-
-template<class Op>
-void hl_naive_lstm_backward_one_sequence(Op op,
-                                         hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize,
-                                         hl_activation_mode_t active_node,
-                                         hl_activation_mode_t active_gate,
-                                         hl_activation_mode_t active_state) {
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rGradIn;
-  real rGradIg;
-  real rGradFg;
-  real rGradOg;
-  real rPrevState = 0;
-  real rPrevStateGrad;
-  real rState;
-  real rStateGrad;
-  real rStateAtv;
-  real rOutputGrad;
-  real rCheckI;
-  real rCheckF;
-  real rCheckO;
-  real rCheckIGrad;
-  real rCheckFGrad;
-  real rCheckOGrad;
-
-  real *valueIn = value.gateValue;
-  real *valueIg = value.gateValue + frameSize;
-  real *valueFg = value.gateValue + frameSize * 2;
-  real *valueOg = value.gateValue + frameSize * 3;
-  real *gradIn = grad.gateGrad;
-  real *gradIg = grad.gateGrad + frameSize;
-  real *gradFg = grad.gateGrad + frameSize * 2;
-  real *gradOg = grad.gateGrad + frameSize * 3;
-
-  for (int i = 0; i < frameSize; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = value.checkIg[i];
-    rCheckF = value.checkFg[i];
-    rCheckO = value.checkOg[i];
-    rState = value.stateValue[i];
-    rStateAtv = value.stateActiveValue[i];
-    rOutputGrad = grad.outputGrad[i];
-    rStateGrad = grad.stateGrad[i];
-    if (value.prevStateValue) {
-      rPrevState = value.prevStateValue[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rGradIn,
-       rGradIg,
-       rGradFg,
-       rGradOg,
-       rPrevState,
-       rPrevStateGrad,
-       rState,
-       rStateGrad,
-       rStateAtv,
-       rOutputGrad,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       rCheckIGrad,
-       rCheckFGrad,
-       rCheckOGrad,
-       hppl::cpu::backward[active_node],
-       hppl::cpu::backward[active_gate],
-       hppl::cpu::backward[active_state]);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    grad.stateGrad[i] = rStateGrad;
-
-    if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
-  }
-}
-
-template<class Op>
-void hl_avx_lstm_forward_one_sequence(Op op,
-                                      hl_lstm_value value,
-                                      int frameSize,
-                                      hl_activation_mode_t active_node,
-                                      hl_activation_mode_t active_gate,
-                                      hl_activation_mode_t active_state) {
-#ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rState;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rStateAtv;
-  __m256 rOut;
-
-  __m256 *valueIn = (__m256*)value.gateValue;
-  __m256 *valueIg = (__m256*)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256*)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256*)(value.gateValue + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256*)value.checkIg)[i];
-    rCheckF = ((__m256*)value.checkFg)[i];
-    rCheckO = ((__m256*)value.checkOg)[i];
-
-    if (value.prevStateValue) {
-      rPrevState = ((__m256*)value.prevStateValue)[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rPrevState,
-       rState,
-       rStateAtv,
-       rOut,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       hppl::avx::forward[active_node],
-       hppl::avx::forward[active_gate],
-       hppl::avx::forward[active_state]);
-
-    valueIn[i] = rValueIn;
-    valueIg[i] = rValueIg;
-    valueFg[i] = rValueFg;
-    valueOg[i] = rValueOg;
-    ((__m256*)value.stateValue)[i] = rState;
-    ((__m256*)value.stateActiveValue)[i] = rStateAtv;
-    ((__m256*)value.outputValue)[i] = rOut;
-  }
-#endif
-}
-
-template<class Op>
-void hl_avx_lstm_backward_one_sequence(Op op,
-                                       hl_lstm_value value,
-                                       hl_lstm_grad grad,
-                                       int frameSize,
-                                       hl_activation_mode_t active_node,
-                                       hl_activation_mode_t active_gate,
-                                       hl_activation_mode_t active_state) {
-#ifdef __AVX__
-  __m256 rValueIn;
-  __m256 rValueIg;
-  __m256 rValueFg;
-  __m256 rValueOg;
-  __m256 rGradIn;
-  __m256 rGradIg;
-  __m256 rGradFg;
-  __m256 rGradOg;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
-  __m256 rPrevStateGrad;
-  __m256 rStateGrad;
-  __m256 rState;
-  __m256 rStateAtv;
-  __m256 rOutputGrad;
-  __m256 rCheckI;
-  __m256 rCheckF;
-  __m256 rCheckO;
-  __m256 rCheckIGrad;
-  __m256 rCheckFGrad;
-  __m256 rCheckOGrad;
-
-  __m256 *valueIn = (__m256*)value.gateValue;
-  __m256 *valueIg = (__m256*)(value.gateValue + frameSize);
-  __m256 *valueFg = (__m256*)(value.gateValue + frameSize * 2);
-  __m256 *valueOg = (__m256*)(value.gateValue + frameSize * 3);
-  __m256 *gradIn = (__m256*)grad.gateGrad;
-  __m256 *gradIg = (__m256*)(grad.gateGrad + frameSize);
-  __m256 *gradFg = (__m256*)(grad.gateGrad + frameSize * 2);
-  __m256 *gradOg = (__m256*)(grad.gateGrad + frameSize * 3);
-
-  for (int i = 0; i < frameSize / 8; i++) {
-    rValueIn = valueIn[i];
-    rValueIg = valueIg[i];
-    rValueFg = valueFg[i];
-    rValueOg = valueOg[i];
-    rCheckI = ((__m256*)value.checkIg)[i];
-    rCheckF = ((__m256*)value.checkFg)[i];
-    rCheckO = ((__m256*)value.checkOg)[i];
-    rState = ((__m256*)value.stateValue)[i];
-    rStateAtv = ((__m256*)value.stateActiveValue)[i];
-    rOutputGrad = ((__m256*)grad.outputGrad)[i];
-    rStateGrad = ((__m256*)grad.stateGrad)[i];
-    if (value.prevStateValue) {
-      rPrevState = ((__m256*)value.prevStateValue)[i];
-    }
-
-    op(rValueIn,
-       rValueIg,
-       rValueFg,
-       rValueOg,
-       rGradIn,
-       rGradIg,
-       rGradFg,
-       rGradOg,
-       rPrevState,
-       rPrevStateGrad,
-       rState,
-       rStateGrad,
-       rStateAtv,
-       rOutputGrad,
-       rCheckI,
-       rCheckF,
-       rCheckO,
-       rCheckIGrad,
-       rCheckFGrad,
-       rCheckOGrad,
-       hppl::avx::backward[active_node],
-       hppl::avx::backward[active_gate],
-       hppl::avx::backward[active_state]);
-
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
-    gradFg[i] = rGradFg;
-    gradOg[i] = rGradOg;
-    ((__m256*)grad.stateGrad)[i] = rStateGrad;
-
-    if (grad.prevStateGrad) ((__m256*)grad.prevStateGrad)[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) ((__m256*)grad.checkIgGrad)[i] += rCheckIGrad;
-      if (grad.checkFgGrad) ((__m256*)grad.checkFgGrad)[i] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) ((__m256*)grad.checkOgGrad)[i] += rCheckOGrad;
-  }
-#endif
-}
-
-template<class Op>
-void hl_cpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-    hl_avx_lstm_forward_one_sequence(op, value, frameSize,
-        active_node, active_gate, active_state);
-  } else {
-    hl_naive_lstm_forward_one_sequence(op, value, frameSize,
-        active_node, active_gate, active_state);
-  }
-}
-
-template<class Op>
-void hl_cpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(real) == 4)) {
-    hl_avx_lstm_backward_one_sequence(op, value, grad, frameSize,
-        active_node, active_gate, active_state);
-  } else {
-    hl_naive_lstm_backward_one_sequence(op, value, grad, frameSize,
-        active_node, active_gate, active_state);
-  }
-}
-
-#endif
-
-#endif /* HL_CPU_LSTM_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
deleted file mode 100644
index 4db9bb74e..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_MATRIX_KERNEL_CUH_
-#define HL_CPU_MATRIX_KERNEL_CUH_
-
-#include <stdio.h>
-#include "hl_base.h"
-
-#ifndef __CUDA_ARCH__
-#include "hl_cpu_matrix_kernel_detail.cuh"
-#endif
-
-/**
- * @brief   cpu element wise unary operator.
- */
-template <class T, class Op>
-void hl_cpu_apply_unary_op(Op op, T* A_h, int dimM, int dimN, int lda) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      op.cpuOperator(A_h[i*lda + j]);
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise binary operator.
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_cpu_apply_binary_op(Op op,
-                            T* A_h,
-                            T* B_h,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      if (BAsRowVector == 0 && BAsColVector == 0) {
-        op.cpuOperator(A_h[i * lda + j], B_h[i * ldb + j]);
-      } else if (BAsRowVector == 1 && BAsColVector == 0) {
-        op.cpuOperator(A_h[i * lda + j], B_h[j]);
-      } else if (BAsRowVector == 0 && BAsColVector == 1) {
-        op.cpuOperator(A_h[i * lda + j], B_h[i * ldb]);
-      } else {
-        op.cpuOperator(A_h[i * lda + j], B_h[0]);
-      }
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise ternary operator.
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_cpu_apply_ternary_op(Op op,
-                             T* A_h,
-                             T* B_h,
-                             T* C_h,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      if (CAsRowVector == 0 && CAsColVector == 0) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[i*ldc + j]);
-      } else if (CAsRowVector == 1 && CAsColVector == 0) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[j]);
-      } else if (CAsRowVector == 0 && CAsColVector == 1) {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[i*ldc]);
-      } else {
-        op.cpuOperator(A_h[i*lda + j], B_h[i*ldb + j], C_h[0]);
-      }
-    }
-  }
-}
-
-/**
- * @brief   cpu element wise quaternary operator.
- */
-template <class T, class Op>
-void hl_cpu_apply_quaternary_op(Op op,
-                                T* A_h,
-                                T* B_h,
-                                T* C_h,
-                                T* D_h,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {
-  for (int i = 0; i < dimM; i ++) {
-    for (int j = 0; j < dimN; j++) {
-      op.cpuOperator(A_h[i*lda + j],
-                     B_h[i*ldb + j],
-                     C_h[i*ldc + j],
-                     D_h[i*ldd + j]);
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))) {
-      hl_sse_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-    } else {
-      hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(B) && hl_check_align(ldb*sizeof(real))) {
-      hl_sse_matrix_row_op(
-        agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-    } else {
-      hl_matrix_row_op(agg, op, sv, dimM, dimN, dst, ld, A, lda, B, ldb);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(dst)) {
-      hl_sse_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-    } else {
-      hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda);
-    }
-  }
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-#ifndef __CUDA_ARCH__
-  if (!Agg::sse || !Op::sse || !Saver::sse) {
-    hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    if (hl_check_align(A) && hl_check_align(lda*sizeof(real))
-      && hl_check_align(B) && hl_check_align(ldb*sizeof(real))
-      && hl_check_align(dst)) {
-      hl_sse_matrix_column_op(
-        agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-    } else {
-      hl_matrix_column_op(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-    }
-  }
-#endif
-}
-
-#endif /* HL_CPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
deleted file mode 100644
index 54a749b99..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
+++ /dev/null
@@ -1,310 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_KERNEL_DETAIL_CUH_
-#define HL_MATRIX_KERNEL_DETAIL_CUH_
-
-#include "hl_matrix_type.cuh"
-
-inline bool hl_check_align(size_t size) {
-  return !(size & (VECTOR_SIZE - 1));
-}
-
-inline bool hl_check_align(void *ptr) {
-  return hl_check_align(reinterpret_cast<size_t>(ptr));
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_row_op(Agg agg, Op op, Saver sv,
-                      int dimM, int dimN,
-                      real *dst, int ld,
-                      real *A, int lda,
-                      real *B, int ldb) {
-  for (int i = 0; i < dimM; i++) {
-    real tmp = agg.init();
-    for (int j = 0; j < dimN; j++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[i*ld] = sv(dst[i*ld], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_matrix_column_op(Agg agg, Op op, Saver sv,
-                         int dimM, int dimN,
-                         real *dst,
-                         real *A, int lda,
-                         real *B, int ldb) {
-  for (int j = 0; j < dimN; j++) {
-    real tmp = agg.init();
-    for (int i = 0; i < dimM; i++) {
-        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
-    }
-    dst[j] = sv(dst[j], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-  for (int i = 0; i < dimM; i++, A += lda) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
-        mm = agg.vecOp(mm, op.vecOp(*a));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
-    vecType mm = VECTOR_SET(agg.init());
-    vecType *a = (vecType*)(A);
-    vecType *b = (vecType*)(B);
-    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
-        mm = agg.vecOp(mm, op.vecOp(*a, *b));
-    }
-
-    int rem = dimN % VECTOR_LEN;
-    if (rem) {
-      real tmp = hl_agg_op(agg, mm);
-      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
-      for (int j = 0; j < rem; j++) {
-          tmp = agg(tmp, op(a[j], b[j]));
-      }
-      dst[i*ld] = sv(dst[i*ld], tmp);
-    } else {
-        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
-    }
-  }
-}
-
-/*
- * MaxRow greater than or equal dimN
- * dimN is multiples of VECTOR_LEN
- * so rem <= MaxRow / VECTOR_LEN
- */
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
-  }
-}
-
-/*
- * dimN is multiples of VECTOR_LEN
- * dimN greater than Step
- */
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-}
-
-template <int MaxRow, class Agg, class Op, class Saver>
-void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
-                               int dimM, int dimN,
-                               real *dst,
-                               real *A, int lda,
-                               real *B, int ldb) {
-  vecType mm[MaxRow / VECTOR_LEN];
-  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
-    mm[n] = VECTOR_SET(agg.init());
-  }
-
-  for (int i = 0; i < dimM; i++) {
-    vecType *a = (vecType*)(A + i * lda);
-    vecType *b = (vecType*)(B + i * ldb);
-    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-    }
-  }
-
-  vecType *result = (vecType*)(dst);
-  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
-    result[n] = sv.vecOp(result[n], mm[n]);
-  }
-
-  int rem = dimN % VECTOR_LEN;
-  if (rem) {
-    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
-    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
-  }
-}
-
-template <int Step, class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
-    vecType mm[Step / VECTOR_LEN];
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      mm[n] = VECTOR_SET(agg.init());
-    }
-
-    for (int i = 0; i < dimM; i++) {
-      vecType *a = (vecType*)(A + i * lda);
-      vecType *b = (vecType*)(B + i * ldb);
-      for (int n = 0; n < Step / VECTOR_LEN; n++) {
-        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
-      }
-    }
-
-    vecType *result = (vecType*)(dst);
-    for (int n = 0; n < Step / VECTOR_LEN; n++) {
-      result[n] = sv.vecOp(result[n], mm[n]);
-    }
-  }
-
-  int remRow = dimN % Step;
-  if (remRow) {
-    hl_sse_column_op_with_rem<Step>(
-        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
-  }
-}
-
-template <class Agg, class Op, class Saver>
-void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-  if (dimN <= 16) {
-    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 32) {
-    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else if (dimN <= 1024 || dimM <= 512) {
-    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-}
-
-#endif /* HL_MATRIX_KERNEL_DETAIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_cpu_scalar.cuh b/paddle/legacy/cuda/include/hl_cpu_scalar.cuh
deleted file mode 100644
index 939302e97..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_scalar.cuh
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SCALAR_CUH_
-#define HL_CPU_SCALAR_CUH_
-
-#define VECTOR_SIMD false
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-/* size of float */
-#define VECTOR_SIZE 4
-#else
-/* size of double */
-#define VECTOR_SIZE 8
-#endif
-
-typedef real vecType;
-
-/* Consider a real as a vector */
-#define VECTOR_LEN  1
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  return mm;
-}
-
-INLINE real hl_vec_set(const real r) {
-  return r;
-}
-
-INLINE real hl_vec_classification_error(const real a,
-                                        const real b,
-                                        const real p,
-                                        const real r) {
-  return ((a > p) == (b > p)) ? 0.0f : 1.0f;
-}
-
-#endif  // HL_CPU_SCALAR_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
deleted file mode 100644
index e54e0f464..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SIMD_NEON_CUH_
-#define HL_CPU_SIMD_NEON_CUH_
-
-#include <arm_neon.h>
-
-#define VECTOR_SIMD true
-#define VECTOR_SIZE 16
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-typedef float32x4_t vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  4
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  float32x4_t rev = vrev64q_f32(mm);
-  float32x4_t tmp1 = agg.vecOp(rev, rev);
-  float32x2_t lo = vget_high_f32(rev);
-  float32x2_t hi = vget_low_f32(rev);
-  float32x4_t tmp2 = vcombine_f32(hi, lo);
-  float32x4_t ret = agg.vecOp(tmp1, tmp2);
-
-  return vgetq_lane_f32(ret, 0);
-}
-
-inline float32x4_t hl_vec_set(const real f) {
-  return vdupq_n_f32(f);
-}
-
-inline float32x4_t hl_vec_classification_error(const float32x4_t a,
-                                               const float32x4_t b,
-                                               const float32x4_t p,
-                                               const float32x4_t r) {
-  uint32x4_t tmp1 = vcgtq_f32(a, p);
-  uint32x4_t tmp2 = vcgtq_f32(b, p);
-  uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);
-  return vcvtq_f32_u32(vandq_u32(tmp3, vcvtq_u32_f32(r)));
-}
-
-#else
-
-#ifdef __aarch64__
-typedef float64x2_t vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  2
-#define VECTOR_SET  vdupq_n_f64
-
-#error To be implemented
-#else
-#error NEON instructions does not support double precision
-#endif  // __aarch64__
-
-#endif
-
-#endif  // HL_CPU_SIMD_NEON_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
deleted file mode 100644
index 20c37d4dd..000000000
--- a/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CPU_SIMD_SSE_CUH_
-#define HL_CPU_SIMD_SSE_CUH_
-
-#include <mmintrin.h>
-#include <xmmintrin.h>
-#include <emmintrin.h>
-
-#define VECTOR_SIMD true
-#define VECTOR_SIZE 16
-#define VECTOR_SET  hl_vec_set
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-typedef __m128  vecType;
-
-/* number of float in vector */
-#define VECTOR_LEN  4
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128 lo = _mm_unpacklo_ps(mm, mm);
-  __m128 hi = _mm_unpackhi_ps(mm, mm);
-  __m128 tmp1 = agg.vecOp(lo, hi);
-  __m128 tmp2 = _mm_movehl_ps(tmp1, tmp1);
-  __m128 ret = agg.vecOp(tmp1, tmp2);
-
-  return _mm_cvtss_f32(ret);
-}
-
-inline __m128 hl_vec_set(const real f) {
-  return _mm_set_ps1(f);
-}
-
-inline __m128 hl_vec_classification_error(const __m128 a,
-                                          const __m128 b,
-                                          const __m128 p,
-                                          const __m128 r) {
-  __m128 tmp1 = _mm_cmpgt_ps(a, p);
-  __m128 tmp2 = _mm_cmpgt_ps(b, p);
-  __m128 tmp3 = _mm_xor_ps(tmp1, tmp2);
-  return _mm_and_ps(tmp3, r);
-}
-
-#else
-
-typedef __m128d vecType;
-
-/* number of double in vector */
-#define VECTOR_LEN  2
-
-template <class Agg>
-inline real hl_agg_op(Agg agg, vecType mm) {
-  __m128d lo = _mm_unpacklo_pd(mm, mm);
-  __m128d hi = _mm_unpackhi_pd(mm, mm);
-  __m128d ret = agg.vecOp(lo, hi);
-
-  return _mm_cvtsd_f64(ret);
-}
-
-inline __m128d hl_vec_set(const real d) {
-#if defined(__APPLE__) || defined(__OSX__)
-  return _mm_set1_pd(d);
-#else
-  return _mm_set_pd1(d);
-#endif
-}
-
-inline __m128d hl_vec_classification_error(const __m128d a,
-                                           const __m128d b,
-                                           const __m128d p,
-                                           const __m128d r) {
-  __m128d tmp1 = _mm_cmpgt_pd(a, p);
-  __m128d tmp2 = _mm_cmpgt_pd(b, p);
-  __m128d tmp3 = _mm_xor_pd(tmp1, tmp2);
-  return _mm_and_pd(tmp3, r);
-}
-
-#endif
-
-#endif  // HL_CPU_SIMD_SSE_CUH_
diff --git a/paddle/legacy/cuda/include/hl_cuda.h b/paddle/legacy/cuda/include/hl_cuda.h
deleted file mode 100644
index 70efcccb8..000000000
--- a/paddle/legacy/cuda/include/hl_cuda.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_H_
-#define HL_CUDA_H_
-
-#include <string>
-#include "hl_base.h"
-
-/**
- * @brief   HPPL event.
- */
-typedef struct _hl_event_st *hl_event_t;
-
-/**
- * @brief return cuda runtime api version.
- */
-extern int hl_get_cuda_lib_version();
-
-/**
- * @brief   HPPL strat(Initialize all GPU).
- */
-extern void hl_start();
-
-/**
- * @brief   HPPL start(Initialize the specific GPU).
- *
- * @param[in]   device  device id(0, 1......).
- *                      if device is NULL, will start all GPU.
- * @param[in]   number  number of devices.
- */
-extern void hl_specify_devices_start(int *device, int number);
-
-/**
- * @brief   Queries if a device may directly access a peer device's memory.
- *
- * @param[in]   device      Device from which allocations on peerDevice are
- *                          to be directly accessed.
- * @param[in]   peerDevice  Device on which the allocations to be directly
- *                          accessed by device reside.
- *
- * @return  Returns true if device is capable of directly accessing memory
- *          from peerDevice and false otherwise.
- */
-bool hl_device_can_access_peer(int device, int peerDevice);
-
-/**
- * @brief   Enables direct access to memory allocations on a peer device.
- *
- * @param[in]   peerDevice  Peer device to enable direct access to from the
- *                          current device
- */
-void hl_device_enable_peer_access(int peerDevice);
-
-/**
- * @brief   Init a work thread.
- *
- * @param[in]   device  device id.
- */
-extern void hl_init(int device);
-
-/**
- * @brief   Finish a work thread.
- */
-extern void hl_fini();
-
-/**
- * @brief   Set synchronous/asynchronous flag.
- *
- * @param[in]   flag    true(default), set synchronous flag.
- *                      false, set asynchronous flag.
- *
- *
- * @note    This setting is only valid for the current worker thread.
- */
-extern void hl_set_sync_flag(bool flag);
-
-/**
- * @brief   Get synchronous/asynchronous flag.
- *
- * @return  Synchronous call true.
- *          Asynchronous call false.
- *
- */
-extern bool hl_get_sync_flag();
-
-/**
- * @brief   Returns the number of compute-capable devices.
- *
- */
-extern int hl_get_device_count();
-
-/**
- * @brief   Set device to be used.
- *
- * @param[in]   device  device id.
- *
- */
-extern void hl_set_device(int device);
-
-/**
- * @brief   Returns which device is currently being used.
- *
- * @return  device  device id.
- *
- */
-extern int hl_get_device();
-
-/**
- * @brief   Allocate device memory.
- *
- * @param[in]   size     size in bytes to copy.
- *
- * @return      dest_d   pointer to device memory.
- */
-extern void *hl_malloc_device(size_t size);
-
-/**
- * @brief   Free device memory.
- *
- * @param[in]   dest_d  pointer to device memory.
- *
- */
-extern void hl_free_mem_device(void *dest_d);
-
-/**
- * @brief   Allocate host page-lock memory.
- *
- * @param[in]   size     size in bytes to copy.
- *
- * @return      dest_h   pointer to host memory.
- */
-extern void *hl_malloc_host(size_t size);
-
-/**
- * @brief   Free host page-lock memory.
- *
- * @param[in]   dest_h  pointer to host memory.
- *
- */
-extern void hl_free_mem_host(void *dest_h);
-
-/**
- * @brief   Copy data.
- *
- * @param[in]   dst     dst memory address(host or device).
- * @param[in]   src     src memory address(host or device).
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy(void *dst, void *src, size_t size);
-
-/**
- * @brief   Set device memory to a value.
- *
- * @param[in]   dest_d  pointer to device memory.
- * @param[in]   value   value to set for each byte of specified memory.
- * @param[in]   size    size in bytes to set.
- *
- */
-extern void hl_memset_device(void *dest_d, int value, size_t size);
-
-/**
- * @brief   Copy host memory to device memory.
- *
- * @param[in]   dest_d  dst memory address.
- * @param[in]   src_h   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size);
-
-/**
- * @brief   Copy device memory to host memory.
- *
- * @param[in]   dest_h  dst memory address.
- * @param[in]   src_d   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size);
-
-/**
- * @brief   Copy device memory to device memory.
- *
- * @param[in]   dest_d  dst memory address.
- * @param[in]   src_d   src memory address.
- * @param[in]   size    size in bytes to copy.
- *
- */
-extern void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size);
-
-/**
- * @brief   Generate uniformly distributed floats (0, 1.0].
- *
- * @param[in]   dest_d  pointer to device memory to store results.
- * @param[in]   num     number of floats to generate.
- *
- */
-extern void hl_rand(real *dest_d, size_t num);
-
-/**
- * @brief   Set the seed value of the random number generator.
- *
- * @param[in]   seed    seed value.
- */
-extern void hl_srand(unsigned int seed);
-
-/**
- * @brief   Copy data.
- *
- * @param[in]   dst     dst memory address(host or device).
- * @param[in]   src     src memory address(host or device).
- * @param[in]   size    size in bytes to copy.
- * @param[in]   stream  stream id.
- */
-extern void hl_memcpy_async(void *dst,
-                            void *src,
-                            size_t size,
-                            hl_stream_t stream);
-
-/**
- * @brief   Waits for stream tasks to complete.
- *
- * @param[in]   stream  stream id.
- */
-extern void hl_stream_synchronize(hl_stream_t stream);
-
-/**
- * @brief   Creates an event object.
- *
- * @param[out]   event  New event.
- */
-extern void hl_create_event(hl_event_t *event);
-
-/**
- * @brief   Destroys an event object.
- *
- * @param[in]   event   Event to destroy.
- */
-extern void hl_destroy_event(hl_event_t event);
-
-/**
- * @brief   Computes the elapsed time between events.
- *
- * @param[in]   start  Starting event.
- * @param[in]   end    Ending event.
- *
- * @return      time   Time between start and end in ms.
- */
-extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
-
-/**
- * @brief   Records an event.
- *
- * @param[in]   stream   Stream in which to insert event.
- * @param[in]   event    Event waiting to be recorded as completed.
- *
- */
-extern void hl_stream_record_event(hl_stream_t stream, hl_event_t event);
-
-/**
- * @brief   Make a compute stream wait on an event.
- *
- * @param[in]   stream   Stream in which to insert event.
- * @param[in]   event    Event to wait on.
- *
- */
-extern void hl_stream_wait_event(hl_stream_t stream, hl_event_t event);
-
-/**
- * @brief   Wait for an event to complete.
- *
- * @param[in]   event       event to wait for.
- *
- */
-extern void hl_event_synchronize(hl_event_t event);
-
-/**
- * @brief   Sets block flags to be used for device executions.
- *
- * @note    This interface needs to be called before hl_start.
- */
-extern void hl_set_device_flags_block();
-
-/**
- * @brief   Returns the last error string from a cuda runtime call.
- */
-extern const char *hl_get_device_error_string();
-
-/**
- * @brief     Returns the last error string from a cuda runtime call.
- *
- * @param[in] err  error number.
- *
- * @see       hl_get_device_last_error()
- */
-extern const char *hl_get_device_error_string(size_t err);
-
-/**
- * @brief   Returns the last error number.
- *
- * @return  error number.
- *
- * @see     hl_get_device_error_string()
- */
-extern int hl_get_device_last_error();
-
-/**
- * @brief   check cuda event is ready
- *
- * @param[in]  event        cuda event to query.
- *
- * @return     true    cuda event is ready.
- *             false   cuda event is not ready.
- */
-extern bool hl_cuda_event_is_ready(hl_event_t event);
-
-/**
- * @brief   hppl device synchronization.
- */
-extern void hl_device_synchronize();
-
-/**
- * @brief   gpu profiler start
- */
-extern void hl_profiler_start();
-
-/**
- * @brief   gpu profiler stop
- */
-extern void hl_profiler_end();
-
-#endif  // HL_CUDA_H_
diff --git a/paddle/legacy/cuda/include/hl_cuda.ph b/paddle/legacy/cuda/include/hl_cuda.ph
deleted file mode 100644
index 7c4465e51..000000000
--- a/paddle/legacy/cuda/include/hl_cuda.ph
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_CUDA_PH_
-#define HL_CUDA_PH_
-
-#include <stdio.h>
-#include <pthread.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
-#include "hl_base.h"
-
-/**
- * @brief   hppl event.
- * @param   cuda event.
- */
-struct _hl_event_st {
-    cudaEvent_t     cu_event;       /* cuda event */
-};
-
-/**
- * @brief   global device resources.
- *
- * @param   *stream         device global stream.
- * @param   handle          devcie cublas handle.
- * @param   gen             device curand generator.
- * @param   cudnn_handle    cudnn handle.
- * @param   *gen_mutex      gen lock.
- */
-typedef struct {
-    cudaStream_t        *stream;
-    cublasHandle_t      handle;
-    curandGenerator_t   gen;
-    cudnnHandle_t       cudnn_handle;
-    pthread_mutex_t     *gen_mutex;
-}_global_device_resources, *global_device_resources;
-
-/*
- * @brief   thread device resources.
- *
- * @param   *stream         device thread stream.
- * @param   *gpu_mem        device memory.
- * @param   *cpu_mem        cpu memory.
- * @param    mem_event      device memory lock.
- */
-typedef struct {
-    cudaStream_t   *stream;
-    real           *gpu_mem;
-    real           *cpu_mem;
-    cudaEvent_t    mem_event;
-}_thread_device_resources, *thread_device_resources;
-
-/*
- * @brief   hppl device properties.
- *
- * @param   device            device id.
- * @param   device_type       0.Nvidia, 1.AMD, 2.Intel.
- * @param   device_name[256]  device name.
- * @param   device_mem        total global memory.
- * @param   major             device compute capability.
- * @param   minor             device compute capability.
- * @param   is_local          local device or not.
- * @param   device_resources  device resources.
- */
-typedef struct {
-    int device;
-    int device_type;
-    char device_name[256];
-    size_t device_mem;
-    int major;
-    int minor;
-    bool is_local;
-    global_device_resources device_resources;
-} _hl_device_prop, *hl_device_prop;
-
-/**
- * @brief   thread device resource allocation.
- *
- * create cuda stream and cuda event, allocate gpu
- * memory and host page-lock memory for threads.
- *
- * @param[in]   device      device number.
- * @param[out]  device_res  device properties.
- */
-extern void hl_create_thread_resources(int device,
-                                       thread_device_resources device_res);
-
-/**
- * @brief   global device resource allocation.
- *
- * create cuda stream, initialize cublas, curand and cudnn.
- *
- * @param[out]   device_prop  device properties.
- */
-extern void hl_create_global_resources(hl_device_prop device_prop);
-
-#endif  /* HL_CUDA_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_cuda_cublas.h b/paddle/legacy/cuda/include/hl_cuda_cublas.h
deleted file mode 100644
index 3959f8167..000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cublas.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUBLAS_H_
-#define HL_CUDA_CUBLAS_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Matrix transpose: C_d = T(A_d)
- *
- * @param[in]   A_d     input matrix (dimM x dimN).
- * @param[out]  C_d     output matrix (dimN x dimM).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- * @param[in]   lda     the first dimension of A_d.
- * @param[in]   ldc     the first dimension of C_d.
- *
- */
-extern void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
-
-/*
- * @brief Matrix transpose, while lda = dimN, ldc = dimM.
- *
- * @param[in]   A_d     input matrix (dimM x dimN).
- * @param[out]  C_d     output matrix (dimN x dimM).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
-
-/*
- * @brief Matrix inverse
- *
- * @param[in]   A_d    input matrix (dimN x dimN).
- * @param[out]  C_d    output matrix (dimN x dimN).
- * @param[in]   dimN   matrix height = matrix width
- * @param[in]   lda    the first dimension of A_d
- * @param[in]   ldc    the first dimension of C_d
- *
- */
-extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     input.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     input.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     output.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- * @param[in]   lda     the first dimension of A_d.
- * @param[in]   ldb     the first dimension of B_d.
- * @param[in]   ldc     the first dimension of C_d.
- *
- */
-extern void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int ldb,
-                          int ldc);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     input.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     input.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     output.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *
- */
-extern void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta);
-
-/**
- * @brief   This function performs the matrix-vector multiplication.
- *          C_d = alpha*op(A_d)*B_d + beta*C_d
- *
- * @param[in]     A_d    matrix.
- * @param[in]     trans  operation op(A) that is non-or transpose.
- * @param[in]     B_d    vector with dimN(dimM) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in,out] C_d    vector with dimM(dimN) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in]     dimM   number of rows of matrix A_d.
- * @param[in]     dimN   number of columns of matrix A_d.
- * @param[in]     alpha  scalar used for multiplication.
- * @param[in]     beta   scalar used for multiplication.
- * @param[in]     lda    the first dimension of A_d.
- * @param[in]     incb   increase B_d size for compaction.
- * @param[in]     incc   increase C_d size for compaction.
- *
- */
-
-extern void hl_matrix_mul_vector(real *A_d,
-                                 hl_trans_op_t trans,
-                                 real *B_d,
-                                 real *C_d,
-                                 int dimM,
-                                 int dimN,
-                                 real alpha,
-                                 real beta,
-                                 int lda,
-                                 int incb,
-                                 int incc);
-
-/**
- * @brief   This function performs the matrix-vector multiplication.
- *          C_d = alpha*op(A_d)*B_d + beta*C_d
- *
- * @param[in]     A_d    matrix.
- * @param[in]     trans  operation op(A) that is non-or transpose.
- * @param[in]     B_d    vector with dimN(dimM) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in,out] C_d    vector with dimM(dimN) elements
- *                       if trans==HPPL_OP_N(HPPL_OP_T).
- * @param[in]     dimM   number of rows of matrix A_d.
- * @param[in]     dimN   number of columns of matrix A_d.
- * @param[in]     alpha  scalar used for multiplication.
- * @param[in]     beta   scalar used for multiplication.
- *
- */
-extern void hl_matrix_mul_vector(real *A_d,
-                                 hl_trans_op_t trans,
-                                 real *B_d,
-                                 real *C_d,
-                                 int dimM,
-                                 int dimN,
-                                 real alpha,
-                                 real beta);
-
-#endif /* HL_CUDA_CUBLAS_H_ */
diff --git a/paddle/legacy/cuda/include/hl_cuda_cudnn.h b/paddle/legacy/cuda/include/hl_cuda_cudnn.h
deleted file mode 100644
index 4664e4144..000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cudnn.h
+++ /dev/null
@@ -1,516 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_H_
-#define HL_CUDA_CUDNN_H_
-
-#include "hl_base.h"
-
-/*
- *  hppl pooling mode
- */
-typedef enum {
-  HL_POOLING_MAX = 0,
-  // average does not include padded values
-  HL_POOLING_AVERAGE = 1,
-  // average includes padded values
-  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
-  HL_POOLING_END
-} hl_pooling_mode_t;
-
-/**
- * @brief return cudnn lib version
- */
-
-extern int hl_get_cudnn_lib_version();
-
-/**
- * @brief   hppl image descriptor.
- */
-typedef struct _hl_tensor_descriptor* hl_tensor_descriptor;
-
-/**
- * @brief   hppl pooling descriptor.
- */
-typedef struct _hl_pooling_descriptor* hl_pooling_descriptor;
-
-/**
- * @brief   hppl filter descriptor.
- */
-typedef struct _hl_filter_descriptor* hl_filter_descriptor;
-
-/**
- * @brief   hppl filter descriptor.
- */
-typedef struct _hl_convolution_descriptor* hl_convolution_descriptor;
-
-/**
- * @brief   create image descriptor.
- *
- * @param[out]   image_desc     image descriptor.
- *
- */
-extern void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc);
-
-/**
- * @brief   reshape image descriptor.
- *
- * @param[in,out]   image_desc    image descriptor.
- * @param[in]       batch_size    input batch size.
- * @param[in]       feature_maps  image feature maps.
- * @param[in]       height        image height.
- * @param[in]       width         image width.
- */
-extern void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width);
-
-/**
- * @brief   reshape image descriptor.
- *
- * @param[in,out]   image_desc    image descriptor.
- * @param[in]       batch_size    input batch size.
- * @param[in]       feature_maps  image feature maps.
- * @param[in]       height        image height.
- * @param[in]       width         image width.
- * @param[in]       nStride       stride between two consecutive images.
- * @param[in]       cStride       stride between two consecutive feature maps.
- * @param[in]       hStride       stride between two consecutive rows.
- * @param[in]       wStride       stride between two consecutive columns.
- *
- */
-extern void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width,
-                              int nStride,
-                              int cStride,
-                              int hStride,
-                              int wStride);
-
-/**
- * @brief   destroy image descriptor.
- *
- * @param[in]   image_desc  hppl image descriptor.
- */
-extern void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc);
-
-/**
- * @brief   create pooling descriptor.
- *
- * @param[out]  pooling_desc    pooling descriptor.
- * @param[in]   mode            pooling mode.
- * @param[in]   height          height of the pooling window.
- * @param[in]   width           width of the pooling window.
- * @param[in]   height_padding  padding height.
- * @param[in]   width_padding   padding width.
- * @param[in]   stride_height   pooling vertical stride.
- * @param[in]   stride_width    pooling horizontal stride.
- */
-extern void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                         hl_pooling_mode_t mode,
-                                         int height,
-                                         int width,
-                                         int height_padding,
-                                         int width_padding,
-                                         int stride_height,
-                                         int stride_width);
-
-/**
- * @brief   destroy pooling descriptor.
- *
- * @param[in]   pooling_desc  hppl pooling descriptor.
- *
- */
-extern void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc);
-
-/**
- * @brief   pooling forward(calculate output image).
- *
- * @param[in]   input           input image descriptor.
- * @param[in]   input_image     input image data.
- * @param[in]   output          output image descriptor.
- * @param[out]  output_image    output image data.
- * @param[in]   pooling         pooling descriptor.
- *
- */
-extern void hl_pooling_forward(hl_tensor_descriptor input,
-                               real* input_image,
-                               hl_tensor_descriptor output,
-                               real* output_image,
-                               hl_pooling_descriptor pooling);
-
-/**
- * @brief   pooling backward(calculate input image gradient).
- *
- * @param[in]   input               input image descriptor.
- * @param[in]   input_image         input image data.
- * @param[in]   input_image_grad    input image gradient data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_image        output image data.
- * @param[out]  output_image_grad   output image gradient data.
- * @param[in]   pooling             pooling descriptor.
- *
- */
-extern void hl_pooling_backward(hl_tensor_descriptor input,
-                                real* input_image,
-                                real* input_image_grad,
-                                hl_tensor_descriptor output,
-                                real* output_image,
-                                real* output_image_grad,
-                                hl_pooling_descriptor pooling);
-
-/**
- * @brief   create filter descriptor.
- *
- * @param[out]  filter                  filter descriptor.
- * @param[in]   input_feature_maps      input image feature maps.
- * @param[in]   output_feature_maps     output image feature maps.
- * @param[in]   height                  filter height.
- * @param[in]   width                   filter width.
- *
- */
-extern void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                        int input_feature_maps,
-                                        int output_feature_maps,
-                                        int height,
-                                        int width);
-
-/**
- * @brief    convolution workspace configuration
- *
- * @param[in]    input                image descriptor
- * @param[in]    output               image descriptor
- * @param[in]    filter               filter descriptor
- * @param[in]    conv                 convolution descriptor
- * @param[out]   convFwdAlgo          forward algorithm
- * @param[out]   fwdLimitBytes        forward workspace size
- * @param[out]   convBwdDataAlgo      backward data algorithm
- * @param[out]   bwdDataLimitBytes    backward data workspace size
- * @param[out]   convBwdFilterAlgo    backward filter algorithm
- * @param[out]   bwdFilterLimitBytes  backward filter workspace size
- *
- */
-extern void hl_conv_workspace(hl_tensor_descriptor input,
-                              hl_tensor_descriptor output,
-                              hl_filter_descriptor filter,
-                              hl_convolution_descriptor conv,
-                              int* convFwdAlgo,
-                              size_t* fwdLimitBytes,
-                              int* convBwdDataAlgo,
-                              size_t* bwdDataLimitBytes,
-                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes,
-                              bool useDilation);
-
-/**
- * @brief   destroy filter descriptor.
- *
- * @param[in]   filter  hppl filter descriptor.
- *
- */
-extern void hl_destroy_filter_descriptor(hl_filter_descriptor filter);
-
-/**
- * @brief   create convolution descriptor.
- *
- * @param[out]  conv                    conv descriptor.
- * @param[in]   image                   input image descriptor.
- * @param[in]   filter                  filter descriptor.
- * @param[in]   padding_height          padding height.
- * @param[in]   padding_width           padding width.
- * @param[in]   stride_height           stride height.
- * @param[in]   stride_width            stride width.
- *
- */
-extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                             hl_tensor_descriptor image,
-                                             hl_filter_descriptor filter,
-                                             int padding_height,
-                                             int padding_width,
-                                             int stride_height,
-                                             int stride_width,
-                                             int dilation_h = 1,
-                                             int dilation_w = 1);
-
-/**
- * @brief   reset convolution descriptor.
- *
- * @param[in,out]   conv                conv descriptor.
- * @param[in]       image               input image descriptor.
- * @param[in]       filter              filter descriptor.
- * @param[in]       padding_height      padding height.
- * @param[in]       padding_width       padding width.
- * @param[in]       stride_height       stride height.
- * @param[in]       stride_width        stride width.
- *
- */
-extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                            hl_tensor_descriptor image,
-                                            hl_filter_descriptor filter,
-                                            int padding_height,
-                                            int padding_width,
-                                            int stride_height,
-                                            int stride_width,
-                                            int dilation_h = 1,
-                                            int dilation_w = 1);
-
-/**
- * @brief   destroy convolution descriptor.
- *
- * @param[in]   conv  hppl convolution descriptor.
- */
-extern void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv);
-
-/**
- * @brief   convolution forward(calculate output image).
- *
- * @param[in]   input           input image descriptor.
- * @param[in]   input_data      input image data.
- * @param[in]   output          output image descriptor.
- * @param[out]  output_data     output image data.
- * @param[in]   filter          filter descriptor.
- * @param[in]   filter_data     filter data.
- * @param[in]   conv            convolution descriptor.
- * @param[in]   gpuWorkSpace    limited gpu workspace.
- * @param[in]   sizeInBytes     gpu workspace size (bytes).
- * @param[in]   convFwdAlgo     forward algorithm.
- */
-extern void hl_convolution_forward(hl_tensor_descriptor input,
-                                   real* input_data,
-                                   hl_tensor_descriptor output,
-                                   real* output_data,
-                                   hl_filter_descriptor filter,
-                                   real* filter_data,
-                                   hl_convolution_descriptor conv,
-                                   void* gpuWorkSpace,
-                                   size_t sizeInBytes,
-                                   int convFwdAlgo);
-
-/**
- * @brief   convolution forward add bias(calculate output add bias).
- *
- * @param[in]   bias                bias descriptor.
- * @param[in]   bias_data           bias data.
- * @param[in]   output              output image descriptor.
- * @param[out]  output_data         output image data.
- */
-extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                            real* bias_data,
-                                            hl_tensor_descriptor output,
-                                            real* output_data);
-
-/**
- * @brief   convolution backward filter(calculate filter grad data).
- *
- * @param[in]   input               input image descriptor.
- * @param[in]   input_data          input image data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- * @param[in]   filter              filter descriptor.
- * @param[out]  filter_grad_data    filter grad data.
- * @param[in]   conv                convolution descriptor.
- * @param[in]   gpuWorkSpace        limited gpu workspace.
- * @param[in]   sizeInBytes         gpu workspace size (bytes).
- * @param[in]   convBwdFilterAlgo   backward filter algorithm.
- */
-extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                           real* input_data,
-                                           hl_tensor_descriptor output,
-                                           real* output_grad_data,
-                                           hl_filter_descriptor filter,
-                                           real* filter_grad_data,
-                                           hl_convolution_descriptor conv,
-                                           void* gpuWorkSpace,
-                                           size_t sizeInBytes,
-                                           int convBwdFilterAlgo);
-
-/**
- * @brief   convolution backward data(calculate input image grad data).
- *
- * @param[in]   input               input image descriptor.
- * @param[out]  input_data_grad     input image grad data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- * @param[in]   filter              filter descriptor.
- * @param[in]   filter_data         filter data.
- * @param[in]   conv                convolution descriptor.
- * @param[in]   gpuWorkSpace        limited gpu workspace.
- * @param[in]   sizeInBytes         gpu workspace size (bytes).
- * @param[in]   convBwdDataAlgo     backward data algorithm.
- */
-extern void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                         real* input_data_grad,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data,
-                                         hl_filter_descriptor filter,
-                                         real* filter_data,
-                                         hl_convolution_descriptor conv,
-                                         void* gpuWorkSpace,
-                                         size_t sizeInBytes,
-                                         int convBwdDataAlgo);
-
-/**
- * @brief   convolution backward bias(calculate bias grad data).
- *
- * @param[in]   bias                bias descriptor.
- * @param[out]  bias_grad_data      bias grad data.
- * @param[in]   output              output image descriptor.
- * @param[in]   output_grad_data    output image grad data.
- */
-extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                         real* bias_grad_data,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data);
-
-/**
- * @brief   softmax forward.
- *
- * @param[in]   input               input value.
- * @param[out]  output              output value.
- * @param[in]   height              matrix height.
- * @param[in]   width               matrix width.
- */
-extern void hl_softmax_forward(real* input,
-                               real* output,
-                               int height,
-                               int width);
-
-/**
- * @brief   softmax backward.
- *
- * @param[in]   output_value        output value data.
- * @param[out]  output_grad         output grad data.
- * @param[in]   height              matrix height.
- * @param[in]   width               matrix width.
- */
-extern void hl_softmax_backward(real* output_value,
-                                real* output_grad,
-                                int height,
-                                int width);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc     input tensor descriptor desc.
- * @param[in]   input         input data.
- * @param[in]   outputDesc    output tensor descriptor desc.
- * @param[out]  output        output data.
- * @param[in]   bnParamDesc   tensor descriptor desc.
- *                            bnScale, bnBias, running mean/var, save_mean/var.
- * @param[in]   scale         batch normalization scale parameter (in original
- *                            paper scale is referred to as gamma).
- * @param[in]   bias          batch normalization bias parameter (in original
- *                            paper scale is referred to as beta).
- * @param[in]   factor        Factor used in the moving average computation.
- *                            runningMean = newMean * factor
- *                                         + runningMean * (1 - factor)
- * @param[in]   runningMean   running mean.
- * @param[in]   runningInvVar running variance.
- * @param[in]   epsilon       Epsilon value used in the batch normalization
- *                            formula.
- * @param[out]  savedMean     optional cache to save intermediate results.
- * @param[out]  savedVar      optional cache to save intermediate results.
- *
- */
-extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real* input,
-                                           hl_tensor_descriptor outputDesc,
-                                           real* output,
-                                           hl_tensor_descriptor bnParamDesc,
-                                           real* scale,
-                                           real* bias,
-                                           double factor,
-                                           real* runningMean,
-                                           real* runningInvVar,
-                                           double epsilon,
-                                           real* savedMean,
-                                           real* savedVar);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc    input tensor descriptor desc.
- * @param[in]   input        input data.
- * @param[in]   outputDesc   output tensor descriptor desc.
- * @param[out]  output       output data.
- * @param[in]   bnParamDesc  tensor descriptor desc.
- *                           bnScale, bnBias, running mean/var, save_mean/var.
- * @param[in]   scale        batch normalization scale parameter (in original
- *                           paper scale is referred to as gamma).
- * @param[in]   bias         batch normalization bias parameter (in original
- *                           paper scale is referred to as beta).
- * @param[in]   estimatedMean
- * @param[in]   estimatedVar It is suggested that resultRunningMean,
- *                           resultRunningVariance from the
- *                           cudnnBatchNormalizationForwardTraining call
- *                           accumulated during the training phase are passed
- *                           as inputs here.
- * @param[in]   epsilon      Epsilon value used in the batch
- *                           normalization formula.
- *
- */
-extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real* input,
-                                            hl_tensor_descriptor outputDesc,
-                                            real* output,
-                                            hl_tensor_descriptor bnParamDesc,
-                                            real* scale,
-                                            real* bias,
-                                            real* estimatedMean,
-                                            real* estimatedVar,
-                                            double epsilon);
-
-/**
- * @brief   cudnn batch norm forward.
- *
- * @param[in]   inputDesc       input tensor descriptor desc.
- * @param[in]   input           input data.
- * @param[in]   outGradDesc     output tensor descriptor desc.
- * @param[out]  outGrad         output data.
- * @param[in]   inGradDesc      input tensor descriptor desc.
- * @param[in]   inGrad          input data.
- * @param[in]   dBnParamDesc    tensor descriptor desc.
- *                              bnScale, bnBias, running mean/var,
- * save_mean/var.
- * @param[in]   scale           batch normalization scale parameter (in original
- *                              paper scale is referred to as gamma).
- * @param[in]   scaleGrad       batch normalization scale parameter (in original
- *                              paper scale is referred to as gamma) gradient.
- * @param[in]   biasGrad        batch normalization bias parameter (in original
- *                              paper scale is referred to as beta) gradient.
- * @param[in]   epsilon         Epsilon value used in the batch
- *                              normalization formula.
- * @param[out]  savedMean       optional cache to save intermediate results.
- * @param[out]  savedInvVar     optional cache to save intermediate results.
- *
- */
-extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real* input,
-                                   hl_tensor_descriptor outGradDesc,
-                                   real* outGrad,
-                                   hl_tensor_descriptor inGradDesc,
-                                   real* inGrad,
-                                   hl_tensor_descriptor dBnParamDesc,
-                                   real* scale,
-                                   real* scaleGrad,
-                                   real* biasGrad,
-                                   double epsilon,
-                                   real* savedMean,
-                                   real* savedInvVar);
-
-#endif  // HL_CUDA_CUDNN_H_
diff --git a/paddle/legacy/cuda/include/hl_cuda_cudnn.ph b/paddle/legacy/cuda/include/hl_cuda_cudnn.ph
deleted file mode 100644
index bb3b89f6f..000000000
--- a/paddle/legacy/cuda/include/hl_cuda_cudnn.ph
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_PH_
-#define HL_CUDA_CUDNN_PH_
-
-#include "hl_base.h"
-
-/*
- * @brief   hppl for cudnn tensor4d descriptor.
- */
-typedef struct {
-    cudnnTensorDescriptor_t     desc;
-    cudnnTensorFormat_t         format;
-    cudnnDataType_t             data_type;  // image data type
-    int batch_size;                         // number of input batch size
-    int feature_maps;                       // number of input feature maps
-    int height;                             // height of input image
-    int width;                              // width of input image
-} _cudnn_tensor_descriptor, *cudnn_tensor_descriptor;
-
-#define GET_TENSOR_DESCRIPTOR(image) (((cudnn_tensor_descriptor)image)->desc)
-
-/*
- * @brief   hppl for cudnn pooling descriptor.
- */
-typedef struct {
-    cudnnPoolingDescriptor_t   desc;
-    cudnnPoolingMode_t         mode;
-    int window_height;
-    int window_width;
-    int stride_height;
-    int stride_width;
-} _cudnn_pooling_descriptor, *cudnn_pooling_descriptor;
-
-/*
- * @brief   hppl for cudnn filter descriptor.
- */
-typedef struct {
-    cudnnFilterDescriptor_t   desc;
-    cudnnDataType_t           data_type;    /* data type */
-    int output_feature_maps;        /* number of output feature maps */
-    int input_feature_maps;         /* number of input feature maps */
-    int filter_height;              /* height of each input filter */
-    int filter_width;               /* width of  each input fitler */
-} _cudnn_filter_descriptor, *cudnn_filter_descriptor;
-
-#define GET_FILTER_DESCRIPTOR(filter) (((cudnn_filter_descriptor)filter)->desc)
-
-/*
- * @brief   hppl for cudnn convolution descriptor.
- */
-typedef struct {
-    cudnnConvolutionDescriptor_t    desc;
-    hl_tensor_descriptor             input_image;
-    hl_filter_descriptor            filter;
-    int padding_height;                     // zero-padding height
-    int padding_width;                      // zero-padding width
-    int stride_height;                      // vertical filter stride
-    int stride_width;                       // horizontal filter stride
-    int upscalex;                           // upscale the input in x-direction
-    int upscaley;                           // upscale the input in y-direction
-    cudnnConvolutionMode_t          mode;
-} _cudnn_convolution_descriptor, *cudnn_convolution_descriptor;
-
-#define GET_CONVOLUTION_DESCRIPTOR(conv)    \
-    (((cudnn_convolution_descriptor)conv)->desc)
-
-#endif /* HL_CUDA_CUDNN_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_device_functions.cuh b/paddle/legacy/cuda/include/hl_device_functions.cuh
deleted file mode 100755
index ef068e106..000000000
--- a/paddle/legacy/cuda/include/hl_device_functions.cuh
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_DEVICE_FUNCTIONS_CUH_
-#define HL_DEVICE_FUNCTIONS_CUH_
-
-namespace paddle {
-
-template <class T>
-inline __device__ T paddleAtomicAdd(T* address, T val);
-
-template <>
-inline __device__ float paddleAtomicAdd(float* address, float val) {
-  return atomicAdd(address, val);
-}
-
-template <>
-inline __device__ double paddleAtomicAdd(double* address, double val) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
-  return atomicAdd(address, val);
-#else
-  // NOLINTNEXTLINE
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old = *address_as_ull, assumed; // NOLINT
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull,
-                    assumed,
-                    __double_as_longlong(val +
-                    __longlong_as_double(assumed)));
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-#endif
-}
-}  // namespace paddle
-
-/**
- * @brief  sum reduction
- *
- * @param[in,out]  smem       input data, better to use __shared__ memory.
- * @param[in]      tid        thread index.
- * @param[in]      threads    the total thread number used to reduce,
- *                            such as, blockDim.x.
- *
- * @return smem[0]: the sum of each elements in smem.
- */
-__device__ __forceinline__
-void simpleReduce(real* smem, int tid, int threads) {
-  for (unsigned int s = threads / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      smem[tid] += smem[tid + s];
-    }
-    __syncthreads();
-  }
-}
-
-#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_functions.h b/paddle/legacy/cuda/include/hl_functions.h
deleted file mode 100644
index 9912b4c17..000000000
--- a/paddle/legacy/cuda/include/hl_functions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_FUNCTIONS_H_
-#define HL_FUNCTIONS_H_
-
-#include "hl_base.h"
-
-/**
- * sigmoid threshold maximum
- */
-#define SIGMOID_THRESHOLD_MIN -40.0
-
-/**
- * sigmoid threshold minimum
- */
-#define SIGMOID_THRESHOLD_MAX 13.0
-
-#ifndef __NVCC__
-namespace hppl {
-/*
- * forward activation
- */
-real relu(const real a);
-real sigmoid(const real a);
-real tanh(const real a);
-real linear(const real a);
-
-/*
- * backward activation
- */
-real relu(const real a, const real b);
-real sigmoid(const real a, const real b);
-real tanh(const real a, const real b);
-real linear(const real a, const real b);
-}  // namespace hppl
-
-#ifdef __AVX__
-#include "hl_avx_functions.h"
-#endif
-
-#else
-#include "hl_gpu_functions.cuh"
-#endif
-
-#endif  // HL_FUNCTIONS_H_
diff --git a/paddle/legacy/cuda/include/hl_gpu.h b/paddle/legacy/cuda/include/hl_gpu.h
deleted file mode 100644
index 50a2e9cdd..000000000
--- a/paddle/legacy/cuda/include/hl_gpu.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_GPU_H_
-#define HL_GPU_H_
-
-#include "hl_aggregate.h"
-#include "hl_base.h"
-#include "hl_cnn.h"
-#include "hl_cuda.h"
-#include "hl_cuda_cublas.h"
-#include "hl_cuda_cudnn.h"
-#include "hl_lstm.h"
-#include "hl_matrix.h"
-#include "hl_sequence.h"
-#include "hl_sparse.h"
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "hl_warpctc_wrap.h"
-#endif
-
-#ifdef HPPL_STUB_FUNC
-#include "stub/hl_aggregate_stub.h"
-#include "stub/hl_cnn_stub.h"
-#include "stub/hl_cuda_cublas_stub.h"
-#include "stub/hl_cuda_cudnn_stub.h"
-#include "stub/hl_cuda_stub.h"
-#include "stub/hl_lstm_stub.h"
-#include "stub/hl_matrix_stub.h"
-#include "stub/hl_sequence_stub.h"
-#include "stub/hl_sparse_stub.h"
-#endif
-
-#endif /* HL_GPU_H_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_functions.cuh b/paddle/legacy/cuda/include/hl_gpu_functions.cuh
deleted file mode 100644
index 705aa71f4..000000000
--- a/paddle/legacy/cuda/include/hl_gpu_functions.cuh
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_FUNCTIONS_CUH_
-#define HL_GPU_FUNCTIONS_CUH_
-
-#include "hl_base.h"
-
-namespace hppl {
-
-  __device__ static real relu(const real a) {
-    return a > 0.0f ? a : 0.0f;
-  }
-
-  __device__ static real sigmoid(const real a) {
-    const real min = SIGMOID_THRESHOLD_MIN;
-    const real max = SIGMOID_THRESHOLD_MAX;
-    real tmp = (a < min) ? min : ((a > max) ? max : a);
-#ifndef PADDLE_TYPE_DOUBLE
-    return __fdividef(1.0f, 1.0f + __expf(-tmp));
-#else
-    return 1.0 / (1.0 + exp(-tmp));
-#endif
-  }
-
-  __device__ static real tanh(const real a) {
-#ifndef PADDLE_TYPE_DOUBLE
-    return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
-#else
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
-#endif
-  }
-
-  __device__ static real linear(const real a) {
-    return a;
-  }
-
-  __device__ static real relu(const real a, const real b) {
-    return a * (b > 0.0f ? 1.0f : 0.0f);
-  }
-
-  __device__ static real sigmoid(const real a, const real b) {
-    return a * b * (1 - b);
-  }
-
-  __device__ static real tanh(const real a, const real b) {
-    return a * (1.0f - b * b);
-  }
-
-  __device__ static real linear(const real a, const real b) {
-    return a;
-  }
-
-}  // namespace hppl
-
-#endif  // HL_GPU_FUNCTIONS_CUH_
diff --git a/paddle/legacy/cuda/include/hl_gpu_gru.cuh b/paddle/legacy/cuda/include/hl_gpu_gru.cuh
deleted file mode 100644
index 8d299572c..000000000
--- a/paddle/legacy/cuda/include/hl_gpu_gru.cuh
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_GRU_CUH_
-#define HL_GPU_GRU_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/legacy/utils/Logging.h"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetOutput, bool isBatch>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
-                                        real *gateValue,
-                                        real *resetOutputValue,
-                                        real *prevOutputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    resetOutputValue += batchIdx * frameSize;
-  }
-
-  real rPrevOut = 0;
-  real rValueResetOutput;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueResetGate  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opResetOutput(rValueUpdateGate,
-                rValueResetGate,
-                rPrevOut,
-                rValueResetOutput,
-                hppl::gpu::forward[active_gate]);
-
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpFinalOutput, bool isBatch>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
-                                        real *gateValue,
-                                        real *prevOutputValue,
-                                        real *outputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    outputValue += batchIdx * frameSize;
-  }
-
-  real rOutput;
-  real rPrevOut = 0;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueFrameState = gateValue[frameIdx + frameSize * 2];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opFinalOutput(rValueUpdateGate,
-                rValueFrameState,
-                rPrevOut,
-                rOutput,
-                hppl::gpu::forward[active_node]);
-
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
-  outputValue[frameIdx] = rOutput;
-}
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.prevOutValue, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_N,
-                  value.gateValue,
-                  batchSize, 2*frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, 2* frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  } else {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.resetOutputValue, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_N,
-                  value.gateValue + 2*frameSize,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  } else {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  }
-
-  CHECK_SYNC("hl_gpu_gru_forward failed");
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpStateGrad, bool isBatch>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *outputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    outputGrad += batchIdx * frameSize;
-  }
-
-  real rUpdateGateGrad;
-  real rFrameStateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rFrameStateValue = gateValue[frameIdx + frameSize * 2];
-  real rOutGrad  = outputGrad[frameIdx];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-  }
-
-  opStateGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rFrameStateValue,
-              rFrameStateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rOutGrad,
-              hppl::gpu::backward[active_node]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetGrad, bool isBatch>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *resetOutputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    resetOutputGrad += batchIdx * frameSize;
-  }
-
-  real rResetGateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rResetOutputGrad = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rUpdateGateGrad  = gateGrad[frameIdx + frameSize * 0];
-  real rResetGateValue  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
-  }
-
-  opResetGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rResetGateValue,
-              rResetGateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rResetOutputGrad,
-              hppl::gpu::backward[active_gate]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  } else {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  }
-
-  if (value.prevOutValue && grad.prevOutGrad) {
-    hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_T,
-                  grad.resetOutputGrad,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 0,
-                  3*frameSize, frameSize, frameSize);
-    if (grad.stateWeightGrad) {
-      hl_matrix_mul(value.resetOutputValue, HPPL_OP_T,
-                    grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                    grad.stateWeightGrad,
-                    frameSize, frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, frameSize);
-    }
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  } else {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  }
-
-  if (grad.prevOutGrad && value.prevOutValue) {
-    hl_matrix_mul(grad.gateGrad, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_T,
-                  grad.prevOutGrad,
-                  batchSize, frameSize, 2*frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  3*frameSize, 2*frameSize, frameSize);
-    if (grad.gateWeightGrad) {
-      hl_matrix_mul(value.prevOutValue, HPPL_OP_T,
-                    grad.gateGrad, HPPL_OP_N,
-                    grad.gateWeightGrad,
-                    frameSize, 2*frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, 2*frameSize);
-    }
-  }
-
-  CHECK_SYNC("hl_gpu_gru_backward failed");
-}
-
-#else
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {}
-
-#endif
-
-#endif /* HL_GPU_GRU_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
deleted file mode 100644
index aae011b83..000000000
--- a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_LSTM_CUH_
-#define HL_GPU_LSTM_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/legacy/utils/Logging.h"
-#include "hl_device_functions.cuh"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmForward(Op op,
-                              hl_lstm_value value,
-                              int frameSize,
-                              int batchSize,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.outputValue += batchIdx * frameSize;
-    value.stateValue  += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-  }
-
-  real rState;
-  real rPrevState = 0;
-  real rStateAtv;
-  real rOut;
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rPrevState,
-     rState,
-     rStateAtv,
-     rOut,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     hppl::gpu::forward[active_node],
-     hppl::gpu::forward[active_gate],
-     hppl::gpu::forward[active_state]);
-
-  value.gateValue[frameIdx] = rValueIn;
-  value.gateValue[frameIdx + frameSize] = rValueIg;
-  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
-  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
-
-  value.stateValue[frameIdx] = rState;
-  value.stateActiveValue[frameIdx] = rStateAtv;
-  value.outputValue[frameIdx] = rOut;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmBackward(Op op,
-                               hl_lstm_value value,
-                               hl_lstm_grad grad,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.stateValue += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-    grad.gateGrad += batchIdx * frameSize * 4;
-    grad.stateGrad += batchIdx * frameSize;
-    grad.outputGrad += batchIdx * frameSize;
-  }
-
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rGradIn;
-  real rGradIg;
-  real rGradFg;
-  real rGradOg;
-  real rPrevState = 0;
-  real rPrevStateGrad;
-  real rState;
-  real rStateGrad;
-  real rStateAtv;
-  real rOutputGrad;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-  real rCheckIGrad;
-  real rCheckFGrad;
-  real rCheckOGrad;
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-  rState = value.stateValue[frameIdx];
-  rStateAtv = value.stateActiveValue[frameIdx];
-  rOutputGrad = grad.outputGrad[frameIdx];
-  rStateGrad = grad.stateGrad[frameIdx];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rGradIn,
-     rGradIg,
-     rGradFg,
-     rGradOg,
-     rPrevState,
-     rPrevStateGrad,
-     rState,
-     rStateGrad,
-     rStateAtv,
-     rOutputGrad,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     rCheckIGrad,
-     rCheckFGrad,
-     rCheckOGrad,
-     hppl::gpu::backward[active_node],
-     hppl::gpu::backward[active_gate],
-     hppl::gpu::backward[active_state]);
-
-  grad.gateGrad[frameIdx] = rGradIn;
-  grad.gateGrad[frameIdx + frameSize    ] = rGradIg;
-  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
-  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
-  grad.stateGrad[frameIdx] = rStateGrad;
-  if (grad.prevStateGrad) {
-    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
-    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
-  }
-
-  if (isBatch) {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
-    }
-    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
-  } else {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
-  }
-}
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmForward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmForward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_forward failed");
-}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmBackward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmBackward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_backward failed");
-}
-
-#else
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {}
-
-#endif
-
-#endif /* HL_GPU_LSTM_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
deleted file mode 100644
index 6177d2365..000000000
--- a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
+++ /dev/null
@@ -1,629 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-
-#ifndef HL_GPU_MATRIX_KERNEL_CUH_
-#define HL_GPU_MATRIX_KERNEL_CUH_
-
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "hl_base.h"
-
-#ifdef __NVCC__
-/* gpu apply interface */
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d,
-                                 int dimM,
-                                 int dimN,
-                                 int lda,
-                                 Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i * lda + j]);
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx]);
-  }
-}
-
-template<class T, class Op, bool BAsRowVector, bool BAsColVector>
-__global__ void KeEltWiseBinaryOp(T *A_d,
-                                  T *B_d,
-                                  int dimM,
-                                  int dimN,
-                                  int lda,
-                                  int ldb,
-                                  Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (BAsRowVector == 0 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]);
-      } else if (BAsRowVector == 1 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[j]);
-      } else if (BAsRowVector == 0 && BAsColVector == 1) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]);
-      } else {
-        op.gpuOperator(A_d[i * lda + j], B_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T *B_d,
-                                   T *C_d,
-                                   const int border,
-                                   Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]);
-  }
-}
-
-template<class T, class Op, bool CAsRowVector, bool CAsColVector>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T* B_d,
-                                   T* C_d,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb,
-                                   int ldc,
-                                   Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (CAsRowVector == 0 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]);
-      } else if (CAsRowVector == 1 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]);
-      } else if (CAsRowVector == 0 && CAsColVector == 1) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]);
-      } else {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      const int border,
-                                      Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      int dimM,
-                                      int dimN,
-                                      int lda,
-                                      int ldb,
-                                      int ldc,
-                                      int ldd,
-                                      Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i*lda + j],
-        B_d[i*ldb + j], C_d[i*ldc + j], D_d[i*ldd + j]);
-    }
-  }
-}
-
-/**
- * @brief   gpu element wise unary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {
-  CHECK_NOTNULL(A_d);
-
-  if (dimM == 1 || dimN == lda) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseUnaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseUnaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, dimM, dimN, lda, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_unary_op failed");
-}
-
-/**
- * @brief   gpu element wise binary operator.
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {
-  CHECK_NOTNULL(A_d);
-
-  if ((BAsRowVector == 0 && BAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseBinaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseBinaryOp<T, Op, BAsRowVector, BAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, dimM, dimN, lda, ldb, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_binary_op failed");
-}
-
-/**
- * @brief   gpu element wise ternary operator.
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {
-  CHECK_NOTNULL(A_d);
-
-  if ((CAsRowVector == 0 && CAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseTernaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseTernaryOp<T, Op, CAsRowVector, CAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_ternary_op failed");
-}
-
-
-/**
- * @brief   gpu element wise quaternary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {
-  CHECK_NOTNULL(A_d);
-
-  if ((dimM == 1) ||
-      (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseQuaternaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseQuaternaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_quaternary_op failed");
-}
-
-#else
-
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {}
-
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {}
-
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {}
-
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {}
-#endif
-
-#ifdef __NVCC__
-/**
- * @brief   matrix row operator.
- */
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-      tmp = agg(tmp, op(A[idx]));
-      idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A, real *B) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-    tmp = agg(tmp, op(A[idx], B[idx]));
-    idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg>
-__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) {
-  for (int stride = size/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      row[tid] = agg(row[tid], row[tid + stride]);
-    }
-    __syncthreads();
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda,
-                              real *B, int ldb) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  B += rowId*ldb;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride,
-                                  int dimM, real *A, int lda) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride, int dimM,
-                                  real *A, int lda, real *B, int ldb) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda], B[index*ldb]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda,
-                                 real *B, int ldb) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda,
-                                   real *B, int ldb) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op,
-        threadIdx.y, blockDimY, dimM, A, lda, B, ldb);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-#endif
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (agg, op, sv, dimN, dst, ld, A, lda);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (agg, op, sv, dimN, dst, ld, A, lda, B, ldb);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gru_ops.cuh b/paddle/legacy/cuda/include/hl_gru_ops.cuh
deleted file mode 100644
index 6c647c514..000000000
--- a/paddle/legacy/cuda/include/hl_gru_ops.cuh
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GRU_OPS_CUH_
-#define HL_GRU_OPS_CUH_
-
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
-#endif
-
-namespace hppl {
-
-namespace forward {
-class gru_resetOutput {
-public:
-  /**
-   * @param[in,out]   valueUpdateGate  update gate
-   * @param[in,out]   valueResetGate   reset gate
-   * @param[in]       prevOut          previous output
-   * @param[out]      valueResetOutput intermediate value for frame state
-   * @param[in]       actGate          forward function of gate
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &valueResetGate,
-                         real &prevOut,
-                         real &valueResetOutput,
-                         Active<real>::forward actGate) {
-    valueUpdateGate  = actGate(valueUpdateGate);
-    valueResetGate   = actGate(valueResetGate);
-    valueResetOutput = prevOut * valueResetGate;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &valueResetGate,
-                         __m256 &prevOut,
-                         __m256 &valueResetOutput,
-                         Active<__m256>::forward actGate) {
-    valueUpdateGate  = actGate(valueUpdateGate);
-    valueResetGate   = actGate(valueResetGate);
-    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
-  }
-#endif
-#endif
-};
-
-class gru_finalOutput {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate
-   * @param[in,out] valueFrameState   frame state ({\tilde{h}_t})
-   * @param[in]     prevOut           previous output
-   * @param[out]    valueOutput       output
-   * @param[in]     actInput          forward function of node
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &valueFrameState,
-                         real &prevOut,
-                         real &valueOutput,
-                         Active<real>::forward actInput ) {
-    valueFrameState = actInput(valueFrameState);
-    valueOutput = prevOut - (valueUpdateGate * prevOut) +
-      (valueUpdateGate * valueFrameState);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &valueFrameState,
-                         __m256 &prevOut,
-                         __m256 &valueOutput,
-                         Active<__m256>::forward actInput) {
-    valueFrameState = actInput(valueFrameState);
-    valueOutput = _mm256_add_ps(
-      _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
-      _mm256_mul_ps(valueUpdateGate, valueFrameState));
-  }
-#endif
-#endif
-};
-}  // namespace forward
-
-namespace backward {
-class gru_stateGrad {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate value
-   * @param[out]    gradUpdateGate    update gate grad
-   * @param[in]     valueFrameState   frame state value
-   * @param[out]    gradFrameState    frame state grad
-   * @param[in]     valuePrevOut      previous output value
-   * @param[in,out] gradPrevOut       previous output grad
-   * @param[in]     gradOutput        output grad
-   * @param[in]     actInput          backward function of frame state
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &gradUpdateGate,
-                         real &valueFrameState,
-                         real &gradFrameState,
-                         real &valuePrevOut,
-                         real &gradPrevOut,
-                         real &gradOutput,
-                         Active<real>::backward actInput) {
-    gradUpdateGate = (gradOutput * valueFrameState);
-    gradUpdateGate -= (gradOutput * valuePrevOut);
-    gradPrevOut -= (gradOutput * valueUpdateGate);
-    gradPrevOut += gradOutput;
-    gradFrameState = actInput(gradOutput * valueUpdateGate, valueFrameState);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &gradUpdateGate,
-                         __m256 &valueFrameState,
-                         __m256 &gradFrameState,
-                         __m256 &valuePrevOut,
-                         __m256 &gradPrevOut,
-                         __m256 &gradOutput,
-                         Active<__m256>::backward actInput) {
-    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
-    gradUpdateGate = _mm256_sub_ps(
-      gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
-    gradPrevOut = _mm256_add_ps(
-      _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
-      gradOutput);
-    gradFrameState = actInput(
-      _mm256_mul_ps(gradOutput, valueUpdateGate), valueFrameState);
-  }
-#endif
-#endif
-};
-
-class gru_resetGrad {
-public:
-  /**
-   * @param[in]     valueUpdateGate   update gate value
-   * @param[in,out] gradUpdateGate    update gate grad
-   * @param[in]     valueResetGate    reset gate value
-   * @param[out]    gradResetGate     reset gate grad
-   * @param[in]     valuePrevOut      previous output value
-   * @param[in,out] gradPrevOut       previous output grad
-   * @param[in]     gradResetOutput   reset output grad (temp val)
-   * @param[in]     actGate           backward function of gate
-   */
-  INLINE void operator()(real &valueUpdateGate,
-                         real &gradUpdateGate,
-                         real &valueResetGate,
-                         real &gradResetGate,
-                         real &valuePrevOut,
-                         real &gradPrevOut,
-                         real &gradResetOutput,
-                         Active<real>::backward actGate) {
-    gradResetGate = (gradResetOutput * valuePrevOut);
-    gradPrevOut += (gradResetOutput * valueResetGate);
-    gradUpdateGate = actGate(gradUpdateGate, valueUpdateGate);
-    gradResetGate  = actGate(gradResetGate , valueResetGate);
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueUpdateGate,
-                         __m256 &gradUpdateGate,
-                         __m256 &valueResetGate,
-                         __m256 &gradResetGate,
-                         __m256 &valuePrevOut,
-                         __m256 &gradPrevOut,
-                         __m256 &gradResetOutput,
-                         Active<__m256>::backward actGate) {
-    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
-    gradPrevOut = _mm256_add_ps(
-      gradPrevOut, _mm256_mul_ps(gradResetOutput, valueResetGate));
-    gradUpdateGate = actGate(gradUpdateGate, valueUpdateGate);
-    gradResetGate  = actGate(gradResetGate , valueResetGate);
-  }
-#endif
-#endif
-};
-}  // namespace backward
-}  // namespace hppl
-
-#endif /* HL_GRU_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_lstm.h b/paddle/legacy/cuda/include/hl_lstm.h
deleted file mode 100644
index 5db4783bf..000000000
--- a/paddle/legacy/cuda/include/hl_lstm.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_LSTM_H_
-#define HL_LSTM_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Lstm sequence parallel forward.
- *
- * @param[in]   gateValue           input value.
- * @param[out]  stateValue          state value.
- * @param[out]  preOutputValue     prev output value.
- * @param[out]  outputValue         output value.
- * @param[in]   checkIg             bias.
- * @param[in]   checkFg             bias.
- * @param[in]   checkOg             bias.
- * @param[in]   weight              weight.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- *
- *
- * @note    Only support frameSize = 32 or 64.
- */
-extern void hl_lstm_parallel_forward(real *gateValue,
-                                     real *stateValue,
-                                     real *preOutputValue,
-                                     real *outputValue,
-                                     real *checkIg,
-                                     real *checkFg,
-                                     real *checkOg,
-                                     real *weight,
-                                     const int *sequence,
-                                     int frameSize,
-                                     int numSequences,
-                                     bool reversed,
-                                     hl_activation_mode_t active_node,
-                                     hl_activation_mode_t active_gate,
-                                     hl_activation_mode_t active_state);
-
-/**
- * @brief   Lstm sequence parallel backward data.
- *
- * @param[in]   gateValue           input value.
- * @param[out]  gateGrad            input gradient.
- * @param[in]   stateValue          state value.
- * @param[out]  stateGrad           state gradient.
- * @param[out]  preOutputValue     prev output value.
- * @param[out]  preOutputGrad      prev output gradient.
- * @param[in]   outputGrad          output gradient.
- * @param[in]   checkIg             bias.
- * @param[out]  checkIgGrad         bias gradient.
- * @param[in]   checkFg             bias.
- * @param[out]  checkFgGrad         bias gradient.
- * @param[in]   checkOg             bias.
- * @param[out]  checkOgGrad         bias gradient.
- * @param[in]   weight              weight.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- *
- *
- * @note    Only support frameSize = 32 or 64.
- */
-extern void hl_lstm_parallel_backward_data(real *gateValue,
-                                           real *gateGrad,
-                                           real *stateValue,
-                                           real *stateGrad,
-                                           real *preOutputValue,
-                                           real *preOutputGrad,
-                                           real *outputGrad,
-                                           real *checkIg,
-                                           real *checkIgGrad,
-                                           real *checkFg,
-                                           real *checkFgGrad,
-                                           real *checkOg,
-                                           real *checkOgGrad,
-                                           real *weight,
-                                           const int *sequence,
-                                           int frameSize,
-                                           int numSequences,
-                                           bool reversed,
-                                           hl_activation_mode_t active_node,
-                                           hl_activation_mode_t active_gate,
-                                           hl_activation_mode_t active_state);
-
-/**
- * @brief   Lstm sequence parallel backward weight.
- *
- * @param[out]  weightGrad          weight gradient.
- * @param[in]   outputValue         output value.
- * @param[in]   gateGrad            gate gradient.
- * @param[in]   sequence            sequence index.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           batch size.
- * @param[in]   numSequences        number of sequences.
- * @param[in]   reversed            reverse.
- *
- */
-extern void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                             real *outputValue,
-                                             real *gateGrad,
-                                             const int *sequence,
-                                             int frameSize,
-                                             int batchSize,
-                                             int numSequences,
-                                             bool reversed);
-
-#endif /* HL_LSTM_H_ */
diff --git a/paddle/legacy/cuda/include/hl_lstm_ops.cuh b/paddle/legacy/cuda/include/hl_lstm_ops.cuh
deleted file mode 100644
index 394fdf5ac..000000000
--- a/paddle/legacy/cuda/include/hl_lstm_ops.cuh
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_LSTM_OPS_CUH_
-#define HL_LSTM_OPS_CUH_
-
-#ifdef __CUDA_ARCH__
-#define INLINE   __device__ inline
-#else
-#define INLINE   inline
-#endif
-
-namespace hppl {
-
-namespace forward {
-class lstm {
-public:
-  /**
-   * @param   valueIn     input
-   * @param   valueIg     input gate
-   * @param   valueFg     forget gate
-   * @param   valueOg     output gate
-   * @param   prevState   previous state
-   * @param   state       current state
-   * @param   stateAtv    state active
-   * @param   output      output
-   * @param   checkI      check input gate
-   * @param   checkF      check forget gate
-   * @param   checkO      check output gate
-   * @param   actInput    forward function of input
-   * @param   actGate     forward function of gate
-   * @param   actState    forward function of state
-   */
-  INLINE void operator()(real &valueIn,
-                         real &valueIg,
-                         real &valueFg,
-                         real &valueOg,
-                         real &prevState,
-                         real &state,
-                         real &stateAtv,
-                         real &output,
-                         real &checkI,
-                         real &checkF,
-                         real &checkO,
-                         Active<real>::forward actInput,
-                         Active<real>::forward actGate,
-                         Active<real>::forward actState) {
-    valueIn = actInput(valueIn);
-    valueIg = actGate(valueIg + prevState * checkI);
-    valueFg = actGate(valueFg + prevState * checkF);
-    state = valueIn * valueIg + prevState * valueFg;
-    valueOg = actGate(valueOg + state * checkO);
-    stateAtv = actState(state);
-    output = valueOg * stateAtv;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueIn,
-                         __m256 &valueIg,
-                         __m256 &valueFg,
-                         __m256 &valueOg,
-                         __m256 &prevState,
-                         __m256 &state,
-                         __m256 &stateAtv,
-                         __m256 &output,
-                         __m256 &checkI,
-                         __m256 &checkF,
-                         __m256 &checkO,
-                         Active<__m256>::forward actInput,
-                         Active<__m256>::forward actGate,
-                         Active<__m256>::forward actState) {
-    valueIn = actInput(valueIn);
-    valueIg = actGate(
-      _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
-    valueFg = actGate(
-      _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
-    state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg)
-        , _mm256_mul_ps(prevState, valueFg));
-    valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)));
-    stateAtv = actState(state);
-    output = _mm256_mul_ps(valueOg, stateAtv);
-  }
-#endif
-#endif
-};
-}  // namespace forward
-
-namespace backward {
-class lstm {
-public:
-  /**
-   * @param   valueIn         input
-   * @param   valueIg         input gate
-   * @param   valueFg         forget gate
-   * @param   valueOg         output gate
-   * @param   gradIn          input grad
-   * @param   gradIg          input gate grad
-   * @param   gradFg          forget gate grad
-   * @param   gradOg          output gate grad
-   * @param   prevState       previous state value
-   * @param   prevStateGrad   previous state grad
-   * @param   state           current state value
-   * @param   stateGrad       current state grad
-   * @param   stateAtv        state active
-   * @param   outputGrad      output grad
-   * @param   checkI          check input gate
-   * @param   checkF          check forget gate
-   * @param   checkO          check output gate
-   * @param   checkIGrad      check input gate grad
-   * @param   checkFGrad      check forget gate grad
-   * @param   checkOGrad      check output gate grad
-   * @param   actInput        backward function of input
-   * @param   actGate         backward function of gate
-   * @param   actState        backward function of state
-   */
-  INLINE void operator()(real &valueIn,
-                         real &valueIg,
-                         real &valueFg,
-                         real &valueOg,
-                         real &gradIn,
-                         real &gradIg,
-                         real &gradFg,
-                         real &gradOg,
-                         real &prevState,
-                         real &prevStateGrad,
-                         real &state,
-                         real &stateGrad,
-                         real &stateAtv,
-                         real &outputGrad,
-                         real &checkI,
-                         real &checkF,
-                         real &checkO,
-                         real &checkIGrad,
-                         real &checkFGrad,
-                         real &checkOGrad,
-                         Active<real>::backward actInput,
-                         Active<real>::backward actGate,
-                         Active<real>::backward actState) {
-    gradOg = actGate(outputGrad * stateAtv, valueOg);
-    stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
-    gradIn = actInput(stateGrad * valueIg, valueIn);
-    gradIg = actGate(stateGrad * valueIn, valueIg);
-    gradFg = actGate(stateGrad * prevState, valueFg);
-    prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
-    checkIGrad = gradIg * prevState;
-    checkFGrad = gradFg * prevState;
-    checkOGrad = gradOg * state;
-  }
-#ifndef __NVCC__
-#ifndef __AVX__
-  static const bool avx = false;
-#else
-  static const bool avx = true;
-  INLINE void operator()(__m256 &valueIn,
-                         __m256 &valueIg,
-                         __m256 &valueFg,
-                         __m256 &valueOg,
-                         __m256 &gradIn,
-                         __m256 &gradIg,
-                         __m256 &gradFg,
-                         __m256 &gradOg,
-                         __m256 &prevState,
-                         __m256 &prevStateGrad,
-                         __m256 &state,
-                         __m256 &stateGrad,
-                         __m256 &stateAtv,
-                         __m256 &outputGrad,
-                         __m256 &checkI,
-                         __m256 &checkF,
-                         __m256 &checkO,
-                         __m256 &checkIGrad,
-                         __m256 &checkFGrad,
-                         __m256 &checkOGrad,
-                         Active<__m256>::backward actInput,
-                         Active<__m256>::backward actGate,
-                         Active<__m256>::backward actState) {
-    gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
-    stateGrad = _mm256_add_ps(
-      actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
-    stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
-    gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn);
-    gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg);
-    gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg);
-    prevStateGrad = _mm256_add_ps(
-      _mm256_mul_ps(gradIg, checkI), _mm256_mul_ps(gradFg, checkF));
-    prevStateGrad = _mm256_add_ps(
-      _mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
-    checkIGrad = _mm256_mul_ps(gradIg, prevState);
-    checkFGrad = _mm256_mul_ps(gradFg, prevState);
-    checkOGrad = _mm256_mul_ps(gradOg, state);
-  }
-#endif
-#endif
-};
-}  // namespace backward
-}  // namespace hppl
-
-#endif /* HL_LSTM_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix.h b/paddle/legacy/cuda/include/hl_matrix.h
deleted file mode 100644
index 88d538343..000000000
--- a/paddle/legacy/cuda/include/hl_matrix.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_H_
-#define HL_MATRIX_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Matrix addition: C_d[i] = alpha * A_d[i] + beta * B_d[i].
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- * @param[in]   alpha   scalar used for addition.
- * @param[in]   beta    scalar used for addition.
- *
- */
-extern void hl_matrix_add(
-    real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta);
-/**
- * @brief   Matrix Softmax.
- *
- * @param[in]   A_d     input maxtrix (M x N).
- * @param[out]  C_d     output matrix (M x N).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN);
-
-/**
- * @brief   Matrix softmax derivative.
- *
- * @param[out]  grad_d       intput matrix (M x N).
- * @param[in]   output_d     output matrix (M x N).
- * @param[in]   sftmaxSum_d  softmax sum (M * 1).
- * @param[in]   dimM         matrix height.
- * @param[in]   dimN         matrix width.
- *
- */
-extern void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN);
-
-/**
- * @brief   Sequence softmax.
- *
- * @param[in]   A_d         input vector.
- * @param[out]  C_d         output vector.
- * @param[in]   index       start positions of sequence.
- * @param[in]   numSequence sequence number.
- *
- */
-extern void hl_sequence_softmax_forward(real* A_d,
-                                        real* C_d,
-                                        const int* index,
-                                        int numSequence);
-
-/**
- * @brief   Matrix cross entropy.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[out]  C_d     output matrix (M X 1).
- * @param[in]   label_d input matrix (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN);
-
-/**
- * @brief   Matrix cross entropy back propagation.
- *
- * @param[out]  grad_d      output matrix (M x N).
- * @param[in]   output_d    input matrix (M x N).
- * @param[in]   label_d     input vector (M x 1).
- * @param[in]   dimM        matrix height.
- * @param[in]   dimN        matrix width.
- *
- */
-extern void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN);
-
-/**
- * @brief  Matrix multi-binary label cross entropy
- *
- * @param[in]   output    input matrix (M x N).
- * @param[out]  entropy   output matrix (M x 1).
- * @param[in]   mat       input sparse matrix.
- * @param[in]   dimM      matrix height.
- * @param[in]   dimN      matrix width.
- */
-extern void hl_matrix_multi_binary_cross_entropy(
-    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN);
-
-/**
- * @brief  Matrix multi-binary label cross entropy backprop
- *
- * @param[in]   output    input matrix (M x N).
- * @param[out]  grad      output matrix (M x N).
- * @param[in]   mat       input sparse matrix.
- * @param[in]   dimM      matrix height.
- * @param[in]   dimN      matrix width.
- */
-extern void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN);
-
-/**
- * @brief  Matrix zero memory.
- *
- * @param[in,out]  data   input data.
- * @param[in]      num    length of data.
- *
- */
-extern void hl_matrix_zero_mem(real* data, int num);
-
-/**
- * @brief parameter relu forward
- *
- * @param[out] output     output data
- * @param[in]  input      input data
- * @param[in]  w          parameter data
- * @param[in]  width      matrix width
- * @param[in]  height     matrix height
- * @param[in]  partial_sum
- */
-
-extern void hl_param_relu_forward(
-    real* output, real* input, real* w, int width, int height, int partial_sum);
-/**
- * @brief parameter relu backward w
- *
- * @param[out] grad_w      w grad
- * @param[in]  grad_o      output grad
- * @param[in]  input       input data
- * @param[in]  width       matrix width
- * @param[in]  height      matrix height
- * @param[in]  partial_sum
- */
-extern void hl_param_relu_backward_w(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum);
-/**
- * @brief parameter relu backward diff
- *
- * @param[in]       grad_o      output grad
- * @param[in]       input       input data
- * @param[in]       w           parameter
- * @param[out]      diff        diff
- * @param[in]       width       matrix width
- * @param[in]       height      matrix height
- * @param[in]       partial_sum
- */
-extern void hl_param_relu_backward_diff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum);
-
-/**
- * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input matrix (1 x channel).
- * @param[in]   channel width of B.
- * @param[in]   dimM    height of A.
- * @param[in]   dimN    width of A.
- * @param[in]   scale   scalar used for addition.
- *
- */
-extern void hl_matrix_add_shared_bias(real* A_d,
-                                      real* B_d,
-                                      const int channel,
-                                      const int dimM,
-                                      const int dimN,
-                                      real scale);
-
-/**
- * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
- *
- * @param[in]   B_d     input matrix (1 x channel).
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   channel width of B.
- * @param[in]   dimM    height of A.
- * @param[in]   dimN    width of A.
- * @param[in]   scale   scalar used for addition.
- *
- */
-extern void hl_matrix_collect_shared_bias(real* B_d,
-                                          real* A_d,
-                                          const int channel,
-                                          const int dimM,
-                                          const int dimN,
-                                          real scale);
-
-/**
- * @brief  Matrix rotation in 90 degrees
- *
- * @param[in]   mat       input matrix (M x N).
- * @param[out]  matRot    output matrix (N x M).
- * @param[in]   dimM      input matrix height.
- * @param[in]   dimN      input matrix width.
- * @param[in]   clockWise rotation direction
- */
-extern void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
-
-/**
- * @brief  Matrix vol2Col: Convert 3D volume into col matrix
- *
- * @param[in]   matSrc     input matrix.
- * @param[in]   channel    channel of matSrc.
- * @param[in]   depth      depth of matSrc.
- * @param[in]   height     height of matSrc.
- * @param[in]   width      width of matSrc.
- * @param[in]   filterD    depth of filter.
- * @param[in]   filterH    height of filter.
- * @param[in]   filterW    width of filter.
- * @param[in]   strideD    stride in the depth.
- * @param[in]   strideH    stride in the height.
- * @param[in]   strideW    stride in the width.
- * @param[in]   paddingD   padding in the depth.
- * @param[in]   paddingH   padding in the height.
- * @param[in]   paddingW   padding in the width.
- * @param[out]   dataDst     output matrix.
- *
- */
-extern void hl_matrix_vol2Col(const real* dataSrc,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              real* dataDst);
-
-/**
- * @brief  Matrix col2Vol: Convert col matrix into 3D volume
- *
- * @param[out]  matDst     output matrix.
- * @param[in]   channel    channel of matDst.
- * @param[in]   depth      depth of matDst.
- * @param[in]   height     height of matDst.
- * @param[in]   width      width of matDst.
- * @param[in]   filterD    depth of filter.
- * @param[in]   filterH    height of filter.
- * @param[in]   filterW    width of filter.
- * @param[in]   strideD    stride in the depth.
- * @param[in]   strideH    stride in the height.
- * @param[in]   strideW    stride in the width.
- * @param[in]   paddingD   padding in the depth.
- * @param[in]   paddingH   padding in the height.
- * @param[in]   paddingW   padding in the width.
- * @param[in]   matSrc     input matrix.
- * @param[in]   beta       input
- * @param[in]   alpha      input
- *
- */
-extern void hl_matrix_col2Vol(real* dataDst,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              const real* dataSrc,
-                              real alpha,
-                              real beta);
-
-/**
- * @brief  Matrix col2Vol: Convert col matrix into 3D volume
- * @param[out]  out     output int vector.
- * @param[in]   vec     input float vector.
- * @param[in]   size    size of the vector.
- */
-extern void hl_vector_cast2int(int* out, real* vec, int size);
-
-#endif /* HL_MATRIX_H_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_apply.cuh b/paddle/legacy/cuda/include/hl_matrix_apply.cuh
deleted file mode 100644
index a067c8233..000000000
--- a/paddle/legacy/cuda/include/hl_matrix_apply.cuh
+++ /dev/null
@@ -1,423 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_APPLY_H_
-#define HL_MATRIX_APPLY_H_
-
-#include "hl_base.h"
-#include "hl_cpu_matrix_kernel.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
-
-/**
- * @brief   CPU element wise unary operator.
- *
- *  element wise op(a) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          unary op. see namespace unary
- * @param[in,out]   A_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- *
- */
-template <class T, class Op>
-extern void hl_cpu_apply_unary_op(Op op,
-                                  T* A_h,
-                                  int dimM,
-                                  int dimN,
-                                  int lda);
-
-/**
- * @brief   CPU element wise binary operator.
- *
- * element wise op(a, b) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (BAsRowVector == 0 && BAsColVector == 0)
- *   op(A[i * lda + j], B[i * ldb + j])
- *
- * if (BAsRowVector == 1 && BAsColVector == 0)
- *   op(A[i * lda + j], B[j])
- *
- * if (BAsRowVector == 0 && BAsColVector == 1)
- *   op(A[i * lda + j], B[i * ldb])
- *
- * if (BAsRowVector == 1 && BAsColVector == 1)
- *   op(A[i * lda + j], B[0])
- *
- * @param[in]       op          binary op. see namespace binary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- *
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-extern void hl_cpu_apply_binary_op(Op op,
-                                   T* A_h,
-                                   T* B_h,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb);
-
-/**
- * @brief   CPU element wise ternary operator.
- *
- * element wise op(a, b, c) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (CAsRowVector == 0 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
- *
- * if (CAsRowVector == 1 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[j])
- *
- * if (CAsRowVector == 0 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
- *
- * if (CAsRowVector == 1 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[0])
- *
- * @param[in]       op          ternary op. see namespace ternary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in,out]   C_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- *
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-extern void hl_cpu_apply_ternary_op(Op op,
-                                    T* A_h,
-                                    T* B_h,
-                                    T* C_h,
-                                    int dimM,
-                                    int dimN,
-                                    int lda,
-                                    int ldb,
-                                    int ldc);
-
-/**
- * @brief   CPU element wise quaternary operator.
- *          element wise op(a, b, c, d) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          quaternary op. see namespace ternary.
- * @param[in,out]   A_h         matrix.
- * @param[in,out]   B_h         matrix.
- * @param[in,out]   C_h         matrix.
- * @param[in,out]   D_h         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- * @param[in]       ldd         leading dimension of D.
- *
- */
-template <class T, class Op>
-extern void hl_cpu_apply_quaternary_op(Op op,
-                                       T* A_h,
-                                       T* B_h,
-                                       T* C_h,
-                                       T* D_h,
-                                       int dimM,
-                                       int dimN,
-                                       int lda,
-                                       int ldb,
-                                       int ldc,
-                                       int ldd);
-
-/**
- * @brief   GPU element wise unary operator.
- *          element wise op(a) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          unary op. see namespace unary.
- * @param[in,out]   A_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- *
- */
-template <class T, class Op>
-extern void hl_gpu_apply_unary_op(Op op,
-                                  T* A_d,
-                                  int dimM,
-                                  int dimN,
-                                  int lda);
-
-/**
- * @brief   GPU element wise binary operator.
- *
- * element wise op(a, b) for 0 <= i < dimM & for 0 <= j < dimN
- *
- * if (BAsRowVector == 0 && BAsColVector == 0)
- *   op(A[i * lda + j], B[i * ldb + j])
- *
- * if (BAsRowVector == 1 && BAsColVector == 0)
- *   op(A[i * lda + j], B[j])
- *
- * if (BAsRowVector == 0 && BAsColVector == 1)
- *   op(A[i * lda + j], B[i * ldb])
- *
- * if (BAsRowVector == 1 && BAsColVector == 1)
- *   op(A[i * lda + j], B[0])
- *
- * @param[in]       op          binary op. see namespace binary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- *
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-extern void hl_gpu_apply_binary_op(Op op,
-                                   T* A_d,
-                                   T* B_d,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb);
-/**
- * @brief   GPU element wise ternary operator.
- *
- * element wise op(a, b, c) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * if (CAsRowVector == 0 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
- *
- * if (CAsRowVector == 1 && CAsColVector == 0)
- *   op(A[i*lda + j], B[i*ldb + j], C[j])
- *
- * if (CAsRowVector == 0 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
- *
- * if (CAsRowVector == 1 && CAsColVector == 1)
- *   op(A[i*lda + j], B[i*ldb + j], C[0])
- *
- * @param[in]       op          ternary op. see namespace ternary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in,out]   C_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- *
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-extern void hl_gpu_apply_ternary_op(Op op,
-                                    T* A_d,
-                                    T* B_d,
-                                    T* C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int lda,
-                                    int ldb,
-                                    int ldc);
-
-
-/**
- * @brief   GPU element wise quaternary operator.
- *          element wise op(a, b, c, d) for 0 <= i < dimM & for 0 <= j < dimN.
- *
- * @param[in]       op          quaternary op. see namespace ternary.
- * @param[in,out]   A_d         matrix.
- * @param[in,out]   B_d         matrix.
- * @param[in,out]   C_d         matrix.
- * @param[in,out]   D_d         matrix.
- * @param[in]       dimM        matrix height.
- * @param[in]       dimN        matrix width.
- * @param[in]       lda         leading dimension of A.
- * @param[in]       ldb         leading dimension of B.
- * @param[in]       ldc         leading dimension of C.
- * @param[in]       ldd         leading dimension of D.
- *
- */
-template <class T, class Op>
-extern void hl_gpu_apply_quaternary_op(Op op,
-                                       T* A_d,
-                                       T* B_d,
-                                       T* C_d,
-                                       T* D_d,
-                                       int dimM,
-                                       int dimN,
-                                       int lda,
-                                       int ldb,
-                                       int ldc,
-                                       int ldd);
-
-/**
- * @brief  CPU matrix row operator.
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda);
-
-/**
- * @brief  CPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Saver, class Agg, class Op>
-extern void hl_cpu_matrix_row_op(Agg agg, Op op,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda,
-                                 real *B, int ldb);
-
-/**
- * @brief  CPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda);
-
-/**
- * @brief  CPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_cpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda,
-                                    real *B, int ldb);
-
-/**
- * @brief  GPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda);
-
-/**
- * @brief  GPU matrix row operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  ld     leading dimension of dst matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Saver, class Agg, class Op>
-extern void hl_gpu_matrix_row_op(Agg agg, Op op,
-                                 int dimM, int dimN,
-                                 real *dst, int ld,
-                                 real *A, int lda,
-                                 real *B, int ldb);
-
-/**
- * @brief  GPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda);
-
-/**
- * @brief  GPU matrix column operator.
- *
- * @param[in]  agg    aggregate operator expression.
- * @param[in]  op     operator expression.
- * @param[in]  sv     assignment operator expression.
- * @param[in]  dimM   matrix height.
- * @param[in]  dimN   matrix width.
- * @param[out] dst    destination matrix.
- * @param[in]  *A     matrix A.
- * @param[in]  lda    leading dimension of matrix A.
- * @param[in]  *B     matrix B.
- * @param[in]  ldb    leading dimension of matrix B.
- *
- */
-template <class Agg, class Op, class Saver>
-extern void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                                    int dimM, int dimN,
-                                    real *dst,
-                                    real *A, int lda,
-                                    real *B, int ldb);
-
-#endif /* HL_MATRIX_APPLY_H_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_base.cuh b/paddle/legacy/cuda/include/hl_matrix_base.cuh
deleted file mode 100644
index a309bb001..000000000
--- a/paddle/legacy/cuda/include/hl_matrix_base.cuh
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_BASE_CUH_
-#define HL_MATRIX_BASE_CUH_
-
-#include "hl_matrix_type.cuh"
-
-class BaseOp {
-public:
-  static const bool sse = false;
-  BaseOp() {}
-  explicit BaseOp(const real s1) {}
-  explicit BaseOp(const real s1, const real s2) {}
-  INLINE vecType vecOp(const vecType a) const {
-    return a;
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return a;
-  }
-};
-
-#ifdef __CUDA_ARCH__
-typedef BaseOp SSESum;
-typedef BaseOp SSEMax;
-typedef BaseOp SSEMin;
-typedef BaseOp SSEIdentity;
-typedef BaseOp SSEAdd;
-typedef BaseOp SSEAdd2;
-typedef BaseOp SSESub;
-typedef BaseOp SSEMul;
-typedef BaseOp SSEDiv;
-typedef BaseOp SSESquaredDiff;
-typedef BaseOp SSEFirst;
-typedef BaseOp SSESecond;
-typedef BaseOp SSEClassificationError;
-#else
-#include "hl_matrix_base_detail.cuh"
-#endif
-
-namespace aggregate {
-class sum : public SSESum {
-public:
-  INLINE real init() { return 0.0f; }
-  INLINE real operator()(const real a, const real b) const {
-    return a + b;
-  }
-};
-
-class max : public SSEMax {
-public:
-  INLINE real init() { return -HL_FLOAT_MAX; }
-  INLINE real operator()(const real a, const real b) const {
-    return a > b ? a : b;
-  }
-};
-
-class min : public SSEMin {
-public:
-  INLINE real init() {return HL_FLOAT_MAX;}
-  INLINE real operator()(const real a, const real b) const {
-    return a > b ? b : a;
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class identity : public SSEIdentity {
-public:
-  INLINE real operator()(const real a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class add : public SSEAdd {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a + b;
-  }
-};
-
-class add2 : public SSEAdd2 {
-private:
-  const real p1;
-  const real p2;
-public:
-  add2(const real s1, const real s2)
-    : SSEAdd2(s1, s2), p1(s1), p2(s2) {}
-  INLINE real operator()(const real a, const real b) const {
-    return p1 * a + p2 * b;
-  }
-};
-
-class sub : public SSESub {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a - b;
-  }
-};
-
-class mul : public SSEMul {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a * b;
-  }
-};
-
-class div : public SSEDiv {
-public:
-  INLINE real operator()(const real a, const real b) const  {
-    return a / b;
-  }
-};
-
-class squaredDiff : public SSESquaredDiff {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return (a - b) * (a - b);
-  }
-};
-
-class first : public SSEFirst {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return a;
-  }
-};
-
-class second : public SSESecond {
-public:
-  INLINE real operator()(const real a, const real b) const {
-    return b;
-  }
-};
-
-class classificationError : public SSEClassificationError {
-private:
-  const real p;
-public:
-  explicit classificationError(const real s)
-    : SSEClassificationError(s), p(s) {}
-  INLINE real operator()(const real a, const real b) const {
-    return ((a > p) == (b > p)) ? 0.0f : 1.0f;
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh b/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
deleted file mode 100644
index 74211bcb9..000000000
--- a/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_BASE_DETAIL_CUH_
-#define HL_MATRIX_BASE_DETAIL_CUH_
-
-#include "hl_matrix_type.cuh"
-#include "hl_tensor_ops.h"
-
-namespace aggregate {
-class SSESum {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add<vecType>()(a, b);
-  }
-};
-
-class SSEMax {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::max<vecType>()(a, b);
-  }
-};
-
-class SSEMin {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::min<vecType>()(a, b);
-  }
-};
-}  // namespace aggregate
-
-namespace base {
-namespace unary {
-class SSEIdentity {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a) const {
-    return a;
-  }
-};
-}  // namespace unary
-
-namespace binary {
-class SSEAdd {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add<vecType>()(a, b);
-  }
-};
-
-class SSEAdd2 {
-public:
-  static const bool sse = VECTOR_SIMD;
-  const real p1;
-  const real p2;
-  vecType mp1;
-  vecType mp2;
-
-public:
-  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
-    mp1 = hl_vec_set(p1);
-    mp2 = hl_vec_set(p2);
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::add_scale<vecType>(mp1, mp2)(a, b);
-  }
-};
-
-class SSESub {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::sub<vecType>()(a, b);
-  }
-};
-
-class SSEMul {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::mul<vecType>()(a, b);
-  }
-};
-
-class SSEDiv {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hppl::binary::div<vecType>()(a, b);
-  }
-};
-
-class SSESquaredDiff {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    vecType tmp = hppl::binary::sub<vecType>()(a, b);
-    return hppl::binary::mul<vecType>()(tmp, tmp);
-  }
-};
-
-class SSEFirst {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return a;
-  }
-};
-
-class SSESecond {
-public:
-  static const bool sse = VECTOR_SIMD;
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return b;
-  }
-};
-
-class SSEClassificationError {
-public:
-  static const bool sse = VECTOR_SIMD;
-  const real p;
-  vecType mp;
-  vecType result;
-
-public:
-  explicit SSEClassificationError(const real s) : p(s) {
-    mp = hl_vec_set(p);
-    result = hl_vec_set(1.0f);
-  }
-  INLINE vecType vecOp(const vecType a, const vecType b) const {
-    return hl_vec_classification_error(a, b, mp, result);
-  }
-};
-}  // namespace binary
-}  // namespace base
-
-#endif /* HL_MATRIX_BASE_DETAIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_ops.cuh b/paddle/legacy/cuda/include/hl_matrix_ops.cuh
deleted file mode 100644
index 4e8bd9123..000000000
--- a/paddle/legacy/cuda/include/hl_matrix_ops.cuh
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_MATRIX_OPS_CUH_
-#define HL_MATRIX_OPS_CUH_
-
-#include "hl_base.h"
-
-#ifdef __NVCC__
-#define HL_DEVICE   __device__
-#else
-#define HL_DEVICE
-#endif
-
-/**
- * @brief   parameter macro.
- */
-#define ONE_PARAMETER(name)     \
-        private: \
-          const T p;\
-        public: \
-          name(const T s) : p(s) {}
-
-#define TWO_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-        public: \
-          name(const T s1, T s2) : p1(s1), p2(s2) {}
-
-#define THREE_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-          const T p3;\
-        public: \
-          name(const T s1, T s2, T s3) : p1(s1), p2(s2), p3(s3) {}
-
-#define FOUR_PARAMETER(name)     \
-        private: \
-          const T p1;\
-          const T p2;\
-          const T p3;\
-          const T p4;\
-        public: \
-          name(const T s1, T s2, T s3, T s4) : p1(s1), p2(s2), p3(s3), p4(s4) {}
-
-/**
- * @brief   unary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_UNARY_OP(name, op) \
-    namespace unary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a) {op;}\
-        inline void cpuOperator(T &a) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   unary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_UNARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace unary {\
-    template<class T>\
-    class name {\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a) {op;}\
-        inline void cpuOperator(T &a) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   binary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_unary_op
- * @see    hl_cpu_apply_unary_op
- */
-#define DEFINE_MATRIX_BINARY_OP(name, op) \
-    namespace binary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b) {op;}\
-        inline void cpuOperator(T &a, T &b) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   binary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b
- *
- * @see    hl_gpu_apply_binary_op
- * @see    hl_cpu_apply_binary_op
- */
-#define DEFINE_MATRIX_BINARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace binary {\
-    template<class T>\
-    class name {\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b) {op;}\
-        inline void cpuOperator(T &a, T &b) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   ternary operator macro.
- *
- * @param   name    operator name.
- * @param   op      operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c
- *
- * @see    hl_gpu_apply_ternary_op
- * @see    hl_cpu_apply_ternary_op
- */
-#define DEFINE_MATRIX_TERNARY_OP(name, op) \
-    namespace ternary {\
-    template<class T>\
-    class name {\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b, T &c) {op;}\
-        inline void cpuOperator(T &a, T &b, T &c) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   ternary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c
- *
- * @see    hl_gpu_apply_ternary_op
- * @see    hl_cpu_apply_ternary_op
- */
-#define DEFINE_MATRIX_TERNARY_PARAMETER_OP(name, PARA_MACRO, op) \
-    namespace ternary {\
-    template<class T>\
-    class name {\
-    private:\
-    PARA_MACRO(name)\
-    public:\
-        HL_DEVICE inline void gpuOperator(T &a, T &b, T &c) {op;}\
-        inline void cpuOperator(T &a, T &b, T &c) {op;}\
-    };\
-    }
-
-
-/**
- * @brief   quaternary operator macro.
- *
- * @param   name        operator name.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c, d
- *
- * @see    hl_gpu_apply_quaternary_op
- * @see    hl_cpu_apply_quaternary_op
- */
-#define DEFINE_MATRIX_QUATERNARY_OP(name, op)     \
-  namespace quaternary {\
-  template<class T>\
-  class name {\
-   public:\
-   HL_DEVICE inline void gpuOperator(T &a, T &b, T &c, T &d) {op;}\
-   inline void cpuOperator(T&a, T &b, T &c, T &d) {op;}\
-  };\
-  }
-
-
-/**
- * @brief   quaternary operator macro.
- *
- * @param   name        operator name.
- * @param   PARA_MACRO  parameter macro.
- * @param   op          operator expression.
- *
- * @note   op format: op supports multiple expressions that are separated
- *         by a comma. e.g. a, b, c, d
- *
- * @see    hl_gpu_apply_quaternary_op
- * @see    hl_cpu_apply_quaternary_op
- */
-#define DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(name, PARA_MACRO, op)     \
-  namespace quaternary {\
-  template<class T>\
-  class name {\
-   private:\
-   PARA_MACRO(name)\
-   public:\
-   HL_DEVICE inline void gpuOperator(T &a, T &b, T &c, T &d) {op;}\
-   inline void cpuOperator(T &a, T &b, T &c, T &d) {op;}\
-  };\
-  }
-
-#endif /* HL_MATRIX_OPS_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_matrix_type.cuh b/paddle/legacy/cuda/include/hl_matrix_type.cuh
deleted file mode 100644
index e61c0d0a4..000000000
--- a/paddle/legacy/cuda/include/hl_matrix_type.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_TYPE_CUH_
-#define HL_MATRIX_TYPE_CUH_
-
-#include "hl_base.h"
-
-#ifdef __CUDA_ARCH__
-/**
- * CUDA kernel inline function
- */
-#define INLINE   __device__ inline
-#else
-/**
- * CPP inline function
- */
-#define INLINE   inline
-#endif
-
-#ifdef __CUDA_ARCH__
-#include <vector_types.h>
-#ifndef PADDLE_TYPE_DOUBLE
-typedef float4 vecType;
-#else
-typedef double2 vecType;
-#endif
-#elif defined(__SSE3__)
-#include "hl_cpu_simd_sse.cuh"
-#define PADDLE_USE_SSE3
-#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && !defined(__NVCC__)
-// Currently nvcc does not support neon intrinsic.
-// TODO: Extract simd intrinsic implementation from .cu files.
-#include "hl_cpu_simd_neon.cuh"
-#define PADDLE_USE_NEON
-#else
-#include "hl_cpu_scalar.cuh"
-#endif
-
-#endif  // HL_MATRIX_TYPE_CUH_
diff --git a/paddle/legacy/cuda/include/hl_perturbation_util.cuh b/paddle/legacy/cuda/include/hl_perturbation_util.cuh
deleted file mode 100644
index e0a27778c..000000000
--- a/paddle/legacy/cuda/include/hl_perturbation_util.cuh
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef DISTRUB_UTIL_CUH_
-#define DISTRUB_UTIL_CUH_
-
-#include "hl_base.h"
-
-/*
- * Functionality: randomly rotate, scale and sample a minibatch of images
-                  and their label maps
- * images:            (numImages, imgPixels, 3)
- * targets:           (numImages, imgPixels, 3)
- *
- * created by Wei Xu. Converted to paddle by Jiang Wang.
- */
-void hl_conv_random_disturb(const real* images, int imgSize, int tgtSize,
-                            int channels, int numImages, real scaleRatio,
-                            real rotateAngle, int samplingRate,
-                            real* gpu_r_angle, real* gpu_s_ratio,
-                            int* gpu_center_r, int* gpu_center_c,
-                            int paddingValue, bool isTrain, real* targets);
-
-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
-                                        int tgtSize, int channels,
-                                        int numImages, int samplingRate,
-                                        const real* gpuRotationAngle,
-                                        const real* gpuScaleRatio,
-                                        const int* gpuCenterR,
-                                        const int* gpuCenterC,
-                                        int paddingValue, real* targets);
-
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
-                                int*& gpuCenterR, int*& gpuCenterC,
-                                int numImages, int imgSize,
-                                real rotateAngle, real scaleRatio,
-                                int samplingRate, bool isTrain);
-
-#endif /* DISTURB_UTIL_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_recurrent_apply.cuh b/paddle/legacy/cuda/include/hl_recurrent_apply.cuh
deleted file mode 100644
index b2cc231f5..000000000
--- a/paddle/legacy/cuda/include/hl_recurrent_apply.cuh
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_RECURRENT_APPLY_CUH_
-#define HL_RECURRENT_APPLY_CUH_
-
-#include "hl_base.h"
-#include "hl_activation_functions.h"
-#include "hl_lstm_ops.cuh"
-#include "hl_gpu_lstm.cuh"
-#include "hl_cpu_lstm.cuh"
-#include "hl_gru_ops.cuh"
-#include "hl_gpu_gru.cuh"
-#include "hl_cpu_gru.cuh"
-
-/**
- * @brief   Cpu lstm forward one sequence.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               hl_lstm_value type.
- * @param[in]   frameSize           frame size.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_cpu_lstm_forward(Op op,
-                                hl_lstm_value value,
-                                int frameSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate,
-                                hl_activation_mode_t active_state);
-
-/**
- * @brief   Cpu lstm backward one sequence.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[in]   value               lstm value.
- * @param[out]  grad                output gradient.
- * @param[in]   frameSize           frame size.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_cpu_lstm_backward(Op op,
-                                 hl_lstm_value value,
-                                 hl_lstm_grad grad,
-                                 int frameSize,
-                                 hl_activation_mode_t active_node,
-                                 hl_activation_mode_t active_gate,
-                                 hl_activation_mode_t active_state);
-
-/**
- * @brief   Gpu lstm batch forward.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               lstm value.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           size of current batch.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_gpu_lstm_forward(Op op,
-                                hl_lstm_value value,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate,
-                                hl_activation_mode_t active_state);
-
-/**
- * @brief   Gpu lstm batch backward.
- *
- * @param[in]   op                  hl_lstm_ops.cuh
- * @param[out]  value               lstm value.
- * @param[out]  grad                lstm gradient.
- * @param[in]   frameSize           frame size.
- * @param[in]   batchSize           size of current batch.
- * @param[in]   active_node         active input type.
- * @param[in]   active_gate         active state type.
- * @param[in]   active_state        actvie gate type.
- */
-template<class Op>
-extern void hl_gpu_lstm_backward(Op op,
-                                 hl_lstm_value value,
-                                 hl_lstm_grad grad,
-                                 int frameSize,
-                                 int batchSize,
-                                 hl_activation_mode_t active_node,
-                                 hl_activation_mode_t active_gate,
-                                 hl_activation_mode_t active_state);
-
-/**
- * @brief   Cpu gru forward.
- *
- * @param[in]     opResetOutput   hl_gru_ops.cuh
- * @param[in]     opFinalOutput   hl_gru_ops.cuh
- * @param[in,out] value           gru value.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpResetOutput, class OpFinalOutput>
-extern void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                               OpFinalOutput opFinalOutput,
-                               hl_gru_value value,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate);
-
-/**
- * @brief   Cpu gru forward.
- *
- * @param[in]     opStateGrad     hl_gru_ops.cuh
- * @param[in]     opResetGrad     hl_gru_ops.cuh
- * @param[in]     value           gru value.
- * @param[in,out] grad            gru gradient.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpStateGrad, class OpResetGrad>
-extern void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                                OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate);
-
-/**
- * @brief   Gpu gru forward.
- *
- * @param[in]     opResetOutput   hl_gru_ops.cuh
- * @param[in]     opFinalOutput   hl_gru_ops.cuh
- * @param[in,out] value           gru value.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpResetOutput, class OpFinalOutput>
-extern void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                               OpFinalOutput opFinalOutput,
-                               hl_gru_value value,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate);
-
-/**
- * @brief   Gpu gru forward.
- *
- * @param[in]     opStateGrad     hl_gru_ops.cuh
- * @param[in]     opResetGrad     hl_gru_ops.cuh
- * @param[in]     value           gru value.
- * @param[in,out] grad            gru gradient.
- * @param[in]     frameSize       frame length/size.
- * @param[in]     batchSize       size of current batch.
- * @param[in]     active_node     active input type.
- * @param[in]     active_gate     active state type.
- */
-template<class OpStateGrad, class OpResetGrad>
-extern void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                                OpResetGrad opResetGrad,
-                                hl_gru_value value,
-                                hl_gru_grad  grad,
-                                int frameSize,
-                                int batchSize,
-                                hl_activation_mode_t active_node,
-                                hl_activation_mode_t active_gate);
-
-#endif /* HL_RECURRENT_APPLY_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_sequence.h b/paddle/legacy/cuda/include/hl_sequence.h
deleted file mode 100644
index 3923bdd92..000000000
--- a/paddle/legacy/cuda/include/hl_sequence.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SEQUENCE_H_
-#define HL_SEQUENCE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Maximum sequence forward.
- *
- * @param[in]   input           each sequence contains some instances.
- * @param[in]   sequence        sequence index..
- * @param[out]  output          max instance in this sequence.
- * @param[out]  index           index of max instance.
- * @param[in]   numSequences    size of sequence[in].
- * @param[in]   dim             input dimension.
- *
- */
-extern void hl_max_sequence_forward(real* input,
-                                    const int* sequence,
-                                    real* output,
-                                    int* index,
-                                    int numSequences,
-                                    int dim);
-
-/**
- * @brief   Maximum sequence backward.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   index           index of max instance.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    size of sequence[in].
- * @param[in]   dim             input dimension.
- *
- */
-extern void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
-
-/**
- * @brief   Memory copy from sequence to batch.
- *
- * if seq2batch == true
- *
- *    copy from sequence to batch: batch[i] = sequence[batchIndex[i]].
- *
- * if seq2batch == false
- *
- *    copy from batch to sequence: sequence[batchIndex[i]] = batch[i].
- *
- * @param[in,out]   batch       batch matrix.
- * @param[in,out]   sequence    equence matrix.
- * @param[in]       batchIndex  index vector.
- * @param[in]       seqWidth    width of sequence.
- * @param[in]       batchCount  number of batchIndex.
- * @param[in]       seq2batch   copy direction.
- *
- */
-extern void hl_sequence2batch_copy(real* batch,
-                                   real* sequence,
-                                   const int* batchIndex,
-                                   int seqWidth,
-                                   int batchCount,
-                                   bool seq2batch);
-
-/**
- * @brief   Add sequence to batch.
- *
- * if seq2batch == true
- *
- *    add sequence to batch: batch[i] = sequence[batchIndex[i]].
- *
- * if seq2batch == false
- *
- *    add batch to sequence: sequence[batchIndex[i]] = batch[i].
- *
- * @param[in,out]   batch       batch matrix.
- * @param[in,out]   sequence    equence matrix.
- * @param[in]       batchIndex  index vector.
- * @param[in]       seqWidth    width of sequence.
- * @param[in]       batchCount  number of batchIndex.
- * @param[in]       seq2batch   copy direction.
- *
- */
-extern void hl_sequence2batch_add(real* batch,
-                                  real* sequence,
-                                  int* batchIndex,
-                                  int seqWidth,
-                                  int batchCount,
-                                  bool seq2batch);
-
-/**
- * @brief   Memory copy from sequence to batch,
- *          while padding all sequences to the same length.
- *
- * if seq2batch == true
- *
- *    copy from sequence to batch:
- *        batch[i] = sequence[sequenceStartPositions[i]]
- *
- * if seq2batch == false
- *
- *    copy from batch to sequence:
- *        sequence[sequenceStartPositions[i]] = batch[i]
- *
- * @param[in,out]   batch                   batch matrix.
- * @param[in,out]   sequence                sequence matrix.
- * @param[in]       sequenceStartPositions  index vector.
- * @param[in]       sequenceWidth           width of sequence.
- * @param[in]       maxSequenceLength       maximum length of sequences.
- * @param[in]       numSequences            number of sequences.
- * @param[in]       normByTimes             whether dividing sequence's length.
- * @param[in]       seq2batch               copy direction.
- *
- */
-extern void hl_sequence2batch_copy_padding(real* batch,
-                                           real* sequence,
-                                           const int* sequenceStartPositions,
-                                           const size_t sequenceWidth,
-                                           const size_t maxSequenceLength,
-                                           const size_t numSequences,
-                                           bool normByTimes,
-                                           bool seq2batch);
-
-/**
- * @brief  dst = Op(src), src is sequence.
- *
- * mode = 0, Op is average.
- *
- * mode = 1, Op is sum.
- *
- * mode = 2, Op is sum(src)/sqrt(N), N is sequence length.
- *
- * @param[in,out]   dst       destination data.
- * @param[in]       src       source data.
- * @param[in]       starts    sequence start positions.
- * @param[in]       height    height of dst data.
- * @param[in]       width     width of dst data.
- * @param[in]       mode      0: avreage,
- *                            1: sum,
- *                            2: divide by square root
- *                            of sequenceLength
- */
-extern void hl_sequence_avg_forward(real* dst,
-                                    real* src,
-                                    const int* starts,
-                                    int height,
-                                    int width,
-                                    const int mode);
-
-extern void hl_sequence_avg_backward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode);
-#endif /* HL_SEQUENCE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_sparse.h b/paddle/legacy/cuda/include/hl_sparse.h
deleted file mode 100644
index 9aab52e04..000000000
--- a/paddle/legacy/cuda/include/hl_sparse.h
+++ /dev/null
@@ -1,523 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SPARSE_H_
-#define HL_SPARSE_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Malloc a sparse matrix.
- *
- * @param[out]  A_d        sparse matrix.
- * @param[in]   format     format.
- * @param[in]   value_type valueType.
- * @param[in]   dimM       height.
- * @param[in]   dimN       width.
- * @param[in]   nnz        number of none zero element.
- *
- */
-extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                    hl_matrix_format_t format,
-                                    hl_matrix_value_t value_type,
-                                    int dimM,
-                                    int dimN,
-                                    int nnz);
-
-/**
- * @brief   Free a sparse matrix.
- *
- * @param[in]  A_d  GPU sparse matrix.
- *
- */
-extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d);
-
-/**
- * @brief   Construct a sparse matrix use input gpu memory.
- *
- * @param[out]  A_d         sparse matrix.
- * @param[in]   dest_d      gpu memory.
- * @param[in]   size        size of dest_d.
- * @param[in]   format      format.
- * @param[in]   value_type  valueType.
- * @param[in]   dimM        height.
- * @param[in]   dimN        width.
- * @param[in]   nnz         number of none zero element.
- *
- * @note    Destruct api is hl_destruct_sparse_matrix.
- *
- */
-extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void *dest_d,
-                                       size_t size,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz);
-
-/**
- * @brief   Use three arrays to construct sparse matrix.
- *
- * if format is HL_SPARSE_CSR, size of rows_d is dimM + 1,
- * and size of cols_d is nnz;
- *
- * if format is HL_SPARSE_CSC, size of rows_d is nnz, and size of
- * cols_d is dimN + 1.
- *
- * if valueType is HL_NO_VALUE, size of value_d is zero,
- * else size of value_d is nnz.
- *
- * @param[out]  A_d        sparse matrix.
- * @param[in]   value_d    value.
- * @param[in]   rows_d     row.
- * @param[in]   cols_d     col.
- * @param[in]   format     format.
- * @param[in]   value_type valueType.
- * @param[in]   dimM       height.
- * @param[in]   dimN       width.
- * @param[in]   nnz        number of none zero element.
- *
- * @note    The corresponding destructor interface is hl_destruct_sparse_matrix.
- *
- */
-extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real *value_d,
-                                       int *rows_d,
-                                       int *cols_d,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz);
-
-/**
- * @brief   Destruct sparse matrix.
- *
- * @param[in] A_d  sparse matrix.
- *
- */
-extern void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d);
-
-/**
- * @brief   Copy value & index to sparse matrix.
- *
- * if csr_matrix is HL_FLOAT_VALUE.
- *
- *  1. csr_val, csr_row, csr_col three pointers are not null.
- *
- *  2. csr_val is not null, csr_row adn csr_col are null.
- *
- * if csr_matrix is HL_NO_VALUE.
- *
- *  1. csr_val will be ignore, csr_row and csr_col are not null.
- *
- *
- * @param[in,out]   csr_matrix sparse matrix.
- * @param[in]       csr_val    point to csr value array(nnz).
- * @param[in]       csr_row    point to csr row indices array(dimM+1).
- * @param[in]       csr_col    point to csr col indices array(nnz).
- * @param[in]       stream     hl_stream_t type.
- *
- */
-extern void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                                 real *csr_val,
-                                 int *csr_row,
-                                 int *csr_col,
-                                 hl_stream_t stream);
-
-/**
- * @brief   Copy value & index to sparse matrix.
- *
- * if csr_matrix is HL_FLOAT_VALUE.
- *
- *   1. csc_val, csc_row, csc_col three pointers are not null.
- *
- *   2. csc_val is not null, csc_row and csc_col are null.
- *
- * if csr_matrix is HL_NO_VALUE.
- *
- *   1. csc_val will be ignore, csc_row and csc_col are not null.
- *
- * @param[in,out]   csc_matrix sparse matrix.
- * @param[in]       csc_val    point to csc value array(nnz).
- * @param[in]       csc_row    point to csc row indices array(nnz).
- * @param[in]       csc_col    point to csc col indices array(dimN+1).
- * @param[in]       stream     hl_stream_t type.
- *
- *
- */
-extern void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                                 real *csc_val,
-                                 int *csc_row,
-                                 int *csc_col,
-                                 hl_stream_t stream);
-
-/**
- * @brief   Copy sparse matrix to sparse matrix.
- *
- * @param[out]  dst     sparse matrix.
- * @param[in]   src     sparse matrix.
- * @param[in]   stream  hl_stream_t type.
- *
- *
- * @note    1. Format of the src matrix and dst matrix needs to be consistent.
- *          2. Source matrix has value, the destination matrix has value or
- *             no value can be; the source matrix is no value, then the
- *             destination matrix must also be no value;
- */
-extern void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                                    hl_sparse_matrix_s src,
-                                    hl_stream_t stream);
-
-/**
- * @brief   csr matrix to dense matrix.
- *
- * @param[in]   A_d     csr matrix.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    height.
- * @param[in]   dimN    width.
- *
- */
-extern void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN);
-
-/**
- * @brief   csc matrix to dense matrix.
- *
- * @param[in]   A_d     csc matrix.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    height.
- * @param[in]   dimN    width.
- *
- */
-extern void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     csr sparse matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     dense matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     sparse matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     dense matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *
- * @param[in]   A_d     dense matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     csc sparse matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- * @note    transa is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_dense_mul_csc(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d.
- *          Calculated based on the non-zero elements of the matrix C.
- *
- * @param[in]     A_d     dense matrix.
- * @param[in]     transa  operation op(A) that is non-or transpose.
- * @param[in]     B_d     dense matrix.
- * @param[in]     transb  operation op(B) that is non-or transpose.
- * @param[in,out] C_d     sparse matrix.
- * @param[in]     dimM    matrix height of op(A) & C
- * @param[in]     dimN    matrix width of op(B) & C
- * @param[in]     dimK    width of op(A) & height of op(B)
- * @param[in]     alpha   scalar used for multiplication.
- * @param[in]     beta    scalar used for multiplication.
- *
- * @note    transb is not support HPPL_OP_T.
- *
- */
-extern void hl_sparse_matrix_mul(real *A_d,
-                                 hl_trans_op_t transa,
-                                 real *B_d,
-                                 hl_trans_op_t transb,
-                                 hl_sparse_matrix_s C_d,
-                                 int dimM,
-                                 int dimN,
-                                 int dimK,
-                                 real alpha,
-                                 real beta);
-
-/**
- * @brief   C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
- *
- * @param[in]   A_d     dense matrix.
- * @param[in]   transa  operation op(A) that is non-or transpose.
- * @param[in]   B_d     sparse matrix.
- * @param[in]   transb  operation op(B) that is non-or transpose.
- * @param[out]  C_d     dense matrix.
- * @param[in]   dimM    matrix height of op(A) & C
- * @param[in]   dimN    matrix width of op(B) & C
- * @param[in]   dimK    width of op(A) & height of op(B)
- * @param[in]   alpha   scalar used for multiplication.
- * @param[in]   beta    scalar used for multiplication.
- *                      If beta is zero, C does not have to be a valid input.
- *
- *
- * @note    transa is not support HPPL_OP_T.
- *
- */
-extern void hl_matrix_dense_mul_csr(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief   Memcpy csc_matrix to host.
- *
- * a. according to csc_matrix, update three arrays
- *
- *  1. csc_val, csc_row, csc_col are dest Address.
- *
- *  2. if type of csc_matrix is HL_NO_VALUE, update csc_row and csc_col
- *
- *  3. if type of csc_matrix is HL_FLOAT_VALUE, update csc_row,
- *     csc_col and csc_value.
- *
- * b. The interface is asynchronous copy. To ensure that the data is copied
- *     please call the synchronous interface;
- *
- *
- * @param[out]  csc_val     point to csc value array(nnz).
- * @param[in]   val_size    csc value size.
- * @param[out]  csc_row     point to csc row indices array(nnz).
- * @param[in]   row_size    csc row size.
- * @param[out]  csc_col     point to csc col indices array(dimN + 1).
- * @param[in]   col_size    csc column size.
- * @param[in]   csc_matrix  sparse matrix.
- * @param[in]   stream      hl_stream_t type.
- *
- */
-extern void hl_memcpy_from_csc_matrix(real *csc_val,
-                                      size_t val_size,
-                                      int *csc_row,
-                                      size_t row_size,
-                                      int *csc_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csc_matrix,
-                                      hl_stream_t stream);
-
-/**
- * @brief   Memcpy sparse matrix to host.
- *
- * a. according to csr_matrix, update three arrays
- *
- *  1. csr_val, csr_row, csr_col are dest Address.
- *
- *  2. if type of csr_matrix is HL_NO_VALUE, update csr_row and csr_col
- *
- *  3. if type of csr_matrix is HL_FLOAT_VALUE, update csr_row,
- *     csr_col and csr_value
- *
- * b. The interface is asynchronous copy. To ensure that the data is copied
- *     please call the synchronous interface;
- *
- * @param[out]  csr_val     point to csr value array(nnz).
- * @param[in]   val_size    csr value size.
- * @param[out]  csr_row     point to csr row indices array(nnz).
- * @param[in]   row_size    csr row size.
- * @param[out]  csr_col     point to csr col indices array(dimN + 1).
- * @param[in]   col_size    csr column size.
- * @param[in]   csr_matrix  sparse matrix.
- * @param[in]   stream      hl_stream_t type.
- *
- */
-extern void hl_memcpy_from_csr_matrix(real *csr_val,
-                                      size_t val_size,
-                                      int *csr_row,
-                                      size_t row_size,
-                                      int *csr_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csr_matrix,
-                                      hl_stream_t stream);
-
-/**
- * @brief   A_d[j] += B_d[i,j] for i in range(height)
- *
- * @param[in,out]   A_d    vector, size = width.
- * @param[in]       B_d    sparse matrix.
- * @param[in]       dimM   height.
- * @param[in]       dimN   width.
- * @param[in]       scale  scale of B_d
- *
- */
-extern void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matirx_column_sum
- */
-extern void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale);
-
-/**
- * @brief   A_d[i,j] += B_d[j]
- *
- * @param[in,out]   A_d    sprare matrix.
- * @param[in]       B_d    vector, size = A_d.width.
- * @param[in]       scale  scale of B_d.
- *
- */
-extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real *B_d,
-                                      real scale);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias
- */
-extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real *B_d,
-                                   real scale);
-
-/**
- * @brief   sparseMatrix = alpha * denseMatrix + beta *sparseMatrix
- *          A_d[i,j] = alpha * B_d[i,j] + beta * A_d[i,j]
- *          Only add value of same (row, col) index in dense matrix and
- *          do not use others values whoes postions are not in sparse matirx.
- *
- * @param[in,out]   A_d    sprare matrix.
- * @param[in]       B_d    dense matrix.
- * @param[in]       dimM   height of B_d.
- * @param[in]       dimN   width of B_d.
- * @param[in]       alpha  scale of B_d.
- * @param[in]       beta   scale of A_d.
- *
- */
-extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real *B_d,
-                                       int dimM,
-                                       int dimN,
-                                       real alpha,
-                                       real beta);
-/**
- * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense
- */
-extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real *B_d,
-                                    int dimM,
-                                    int dimN,
-                                    real alpha,
-                                    real beta);
-
-/**
- * @brief get rows pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return rows pointer, which is gpu address
- *
- */
-extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat);
-
-/**
- * @brief get cols pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return cols pointer, which is gpu address
- *
- */
-extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat);
-
-/**
- * @brief get value pionter of GpuSparseMatrix
- *
- * @param[in]    sMat  sparse matrix
- *
- * @return   return value pointer, which is gpu address
- *
- */
-extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat);
-
-#endif /* HL_SPARSE_H_ */
diff --git a/paddle/legacy/cuda/include/hl_sparse.ph b/paddle/legacy/cuda/include/hl_sparse.ph
deleted file mode 100644
index c0fdccb94..000000000
--- a/paddle/legacy/cuda/include/hl_sparse.ph
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_SPARSE_PH_
-#define HL_SPARSE_PH_
-
-#include "hl_base.h"
-
-/**
- * @brief   sparse matrix csr format.
- *
- * @param   *csr_val     nonzero values of matrix.
- * @param   *csr_row     row indices.
- * @param   *csr_col     column indices.
- * @param   nnz_s        sizeof of csr_val & csr_col.
- * @param   row_s        sizeof of csr_row.
- * @param   sparsity     sparsity pattern.
- *
- */
-typedef struct {
-    real                *csr_val;
-    int                 *csr_row;
-    int                 *csr_col;
-    size_t              nnz_s;
-    int                 row_s;
-    float               sparsity;
-}_hl_csr_matrix, *hl_csr_matrix;
-
-/**
- * @brief   sparse matrix csc format.
- *
- * @param   *csc_val      nonzero values of matrix.
- * @param   *csc_row      row indices.
- * @param   *csc_col      column indices.
- * @param   nnz_s         sizeof of csc_val & csc_row.
- * @param   col_s         sizeof of csc_col.
- * @param   sparsity      sparsity pattern.
- *
- */
-typedef struct {
-    real                *csc_val;
-    int                 *csc_row;
-    int                 *csc_col;
-    size_t              nnz_s;
-    int                 col_s;
-    float               sparsity;
-}_hl_csc_matrix, *hl_csc_matrix;
-
-#define __sparse_get_type_return__(mat, type, field)\
-  do {\
-    hl_##type##_matrix type##_d = (hl_##type##_matrix)((mat)->matrix);\
-    if (type##_d) {\
-      return type##_d -> type##_##field;\
-    } else {\
-      LOG(WARNING) << "parameter " <<  #field << "NULL error!";\
-      return NULL;\
-    }\
-  } while(0)
-
-#define __sparse_get_return__(mat, field)\
-  do {\
-    if ((mat) == NULL) {\
-      LOG(WARNING) << "parameter NULL error!";\
-      return NULL;\
-    }\
-    if ((mat)->format == HL_SPARSE_CSR) {\
-      __sparse_get_type_return__(mat, csr, field);\
-    } else {\
-      __sparse_get_type_return__(mat, csc, field);\
-    }\
-  } while(0)
-
-#endif  /* HL_SPARSE_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_table_apply.h b/paddle/legacy/cuda/include/hl_table_apply.h
deleted file mode 100644
index dff60aa0a..000000000
--- a/paddle/legacy/cuda/include/hl_table_apply.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TABLE_APPLY_H_
-#define HL_TABLE_APPLY_H_
-
-/**
- * @brief   Get row from table.
- *          output[i] += table[ids[i]]
- *          if ids[i] == -1, it will be ignored
- *
- * @param[out]  output          output matrix.
- * @param[in]   ldo             leading dimension of output.
- * @param[in]   table           table matrix.
- * @param[in]   ldt             leading dimension of table.
- * @param[in]   ids             ids vector.
- * @param[in]   numSamples      height of output.
- * @param[in]   tableSize       height of table.
- * @param[in]   dim             width of table.
- *
- */
-extern void hl_matrix_select_rows(real* output,
-                                  int ldo,
-                                  real* table,
-                                  int ldt,
-                                  int* ids,
-                                  int numSamples,
-                                  int tableSize,
-                                  int dim);
-
-/**
- * @brief   Add row to table.
- *          table[ids[i]] += output[i]
- *          if ids[i] == -1, it will be ignored
- *
- * @param[out]  table           table matrix.
- * @param[in]   ldt             leading dimension of table.
- * @param[in]   input           input matrix.
- * @param[in]   ldi             leading dimension of input.
- * @param[in]   ids             ids vector.
- * @param[in]   numSamples      height of input.
- * @param[in]   tableSize       height of table.
- * @param[in]   dim             width of table.
- *
- */
-extern void hl_matrix_add_to_rows(real* table,
-                                  int ldt,
-                                  real* input,
-                                  int ldi,
-                                  int* ids,
-                                  int numSamples,
-                                  int tableSize,
-                                  int dim);
-
-/**
- * @brief   Select element from vector.
- *
- * @param[out]  dst         output vector.
- * @param[in]   sized       size of dst.
- * @param[in]   src         input vector.
- * @param[in]   sizes       size of src.
- * @param[in]   ids         index vector.
- * @param[in]   sizei       size of ids.
- *
- */
-template <class T>
-extern void hl_vector_select_from(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei);
-
-#endif /* HL_TABLE_APPLY_H_ */
diff --git a/paddle/legacy/cuda/include/hl_tensor_ops.h b/paddle/legacy/cuda/include/hl_tensor_ops.h
deleted file mode 100644
index bc5e5da53..000000000
--- a/paddle/legacy/cuda/include/hl_tensor_ops.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TENSOR_OPS_H_
-#define HL_TENSOR_OPS_H_
-
-#include <cmath>
-#include "hl_matrix_type.cuh"
-
-namespace hppl {
-namespace unary {
-
-template <class T>
-class add_scale {
- private:
-  const T p;
-
- public:
-  INLINE add_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a + p; }
-};
-
-template <class T>
-class sub_scale {
- private:
-  const T p;
-
- public:
-  INLINE sub_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a - p; }
-};
-
-template <class T>
-class mul_scale {
- private:
-  const T p;
-
- public:
-  INLINE mul_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a * p; }
-};
-
-template <class T>
-class div_scale {
- private:
-  const T p;
-
- public:
-  INLINE div_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a / p; }
-};
-
-template <class T>
-class neg {
- public:
-  INLINE T operator()(const T a) const { return -a; }
-};
-
-template <class T>
-class exp_op {
- public:
-  INLINE T operator()(const T a) const { return std::exp(a); }
-};
-
-template <class T>
-class log_op {
- public:
-  INLINE T operator()(const T a) const { return std::log(a); }
-};
-
-template <class T>
-class sqrt_op {
- public:
-  INLINE T operator()(const T a) const { return std::sqrt(a); }
-};
-
-template <class T>
-class square {
- public:
-  INLINE T operator()(const T a) const { return a * a; }
-};
-
-template <class T>
-class reciprocal {
- public:
-  INLINE T operator()(const T a) const { return T(1) / a; }
-};
-
-template <class T>
-class abs {
- public:
-  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
-};
-
-template <class T>
-class sign {
- public:
-  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
-};
-
-template <class T>
-class min {
- private:
-  const T p;
-
- public:
-  INLINE min(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a > p ? p : a; }
-};
-
-template <class T>
-class max {
- private:
-  const T p;
-
- public:
-  INLINE max(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a < p ? p : a; }
-};
-
-template <class T>
-class pow_op {
- private:
-  const T p;
-
- public:
-  INLINE pow_op(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return std::pow(a, p); }
-};
-
-template <class T>
-class constant {
- private:
-  const T p;
-
- public:
-  INLINE constant(const T s) : p(s) {}
-  INLINE T operator()(int i) const { return p; }
-  INLINE T operator()(int i, int j) const { return p; }
-};
-
-template <class T>
-class cmp_eq {
- private:
-  const T p;
-
- public:
-  INLINE cmp_eq(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a == p; }
-};
-
-template <class T>
-class cmp_ne {
- private:
-  const T p;
-
- public:
-  INLINE cmp_ne(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a != p; }
-};
-
-template <class T>
-class cmp_le {
- private:
-  const T p;
-
- public:
-  INLINE cmp_le(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a <= p; }
-};
-
-template <class T>
-class cmp_lt {
- private:
-  const T p;
-
- public:
-  INLINE cmp_lt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a < p; }
-};
-
-template <class T>
-class cmp_ge {
- private:
-  const T p;
-
- public:
-  INLINE cmp_ge(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a >= p; }
-};
-
-template <class T>
-class cmp_gt {
- private:
-  const T p;
-
- public:
-  INLINE cmp_gt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a > p; }
-};
-
-template <class T>
-class and_op {
- private:
-  const T p;
-
- public:
-  INLINE and_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a && p; }
-};
-
-template <class T>
-class or_op {
- private:
-  const T p;
-
- public:
-  INLINE or_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a || p; }
-};
-
-}  // namespace unary
-
-namespace binary {
-template <class T>
-class add {
- public:
-  INLINE T operator()(const T a, const T b) const { return a + b; }
-};
-
-template <class T>
-class add_scale {
- private:
-  const T p1;
-  const T p2;
-
- public:
-  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
-  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
-};
-
-template <class T>
-class sub {
- public:
-  INLINE T operator()(const T a, const T b) const { return a - b; }
-};
-
-template <class T>
-class mul {
- public:
-  INLINE T operator()(const T a, const T b) const { return a * b; }
-};
-
-template <class T>
-class div {
- public:
-  INLINE T operator()(const T a, const T b) const { return a / b; }
-};
-
-template <class T>
-class cmp_eq {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a == b; }
-};
-
-template <class T>
-class cmp_ne {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a != b; }
-};
-
-template <class T>
-class cmp_le {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a <= b; }
-};
-
-template <class T>
-class cmp_lt {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a < b; }
-};
-
-template <class T>
-class cmp_ge {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a >= b; }
-};
-
-template <class T>
-class cmp_gt {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a > b; }
-};
-
-template <class T>
-class and_op {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a && b; }
-};
-
-template <class T>
-class or_op {
- public:
-  INLINE bool operator()(const T a, const T b) const { return a || b; }
-};
-
-template <class T>
-class min {
- public:
-  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
-};
-
-template <class T>
-class max {
- public:
-  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
-};
-
-#ifdef PADDLE_USE_SSE3
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128> {
- private:
-  const __m128 p1;
-  const __m128 p2;
-
- public:
-  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
-  }
-};
-
-template <>
-class sub<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-};
-
-template <>
-class mul<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-};
-
-template <>
-class div<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-};
-
-template <>
-class min<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-};
-
-template <>
-class max<__m128> {
- public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-};
-#else
-template <>
-class add<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128d> {
- private:
-  const __m128d p1;
-  const __m128d p2;
-
- public:
-  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
-  }
-};
-
-template <>
-class sub<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-
-template <>
-class mul<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-
-template <>
-class div<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-
-template <>
-class min<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-
-template <>
-class max<__m128d> {
- public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_SSE3
-
-#ifdef PADDLE_USE_NEON
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(a, b);
-  }
-};
-
-template <>
-class add_scale<float32x4_t> {
- private:
-  const float32x4_t p1;
-  const float32x4_t p2;
-
- public:
-  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
-      : p1(s1), p2(s2) {}
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
-  }
-};
-
-template <>
-class sub<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vsubq_f32(a, b);
-  }
-};
-
-template <>
-class mul<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmulq_f32(a, b);
-  }
-};
-
-template <>
-class div<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    float32x4_t tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
-  }
-};
-
-template <>
-class min<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vminq_f32(a, b);
-  }
-};
-
-template <>
-class max<float32x4_t> {
- public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmaxq_f32(a, b);
-  }
-};
-#else
-#error To be implemented
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_NEON
-
-}  // namespace binary
-}  // namespace hppl
-
-#endif  // HL_TENSOR_OPS_H_
diff --git a/paddle/legacy/cuda/include/hl_thread.ph b/paddle/legacy/cuda/include/hl_thread.ph
deleted file mode 100644
index 4abede151..000000000
--- a/paddle/legacy/cuda/include/hl_thread.ph
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_THREAD_PH_
-#define HL_THREAD_PH_
-
-#include <stdio.h>
-#include <pthread.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
-#include "hl_base.h"
-
-/**
- * @brief   Thread resource structure.
- *
- * @param   stream[HPPL_STREAM_END] Stream for thread.
- * @param   handle                  Cublas Handle.
- * @param   gen                     Curand Generator.
- * @param   cudnn_handle            Cudnn handle.
- * @param   cudnn_desc              Cudnn image descriptor.
- * @param   *gen_mutex              Gen lock.
- * @param   *gpu_mem                HPPL GPU Memory.
- * @param   *cpu_mem                HPPL CPU Memory.
- * @param   event                   gpu_mem event.
- * @param   device                  Thread device context.
- * @param   major                   Compute capability.
- * @param   is_init                 Thread init or not.
- */
-typedef struct {
-    cudaStream_t             stream[HPPL_STREAM_END];
-    cublasHandle_t           handle;
-    curandGenerator_t        gen;
-    cudnnHandle_t            cudnn_handle;
-    cudnnTensorDescriptor_t  cudnn_desc;
-    pthread_mutex_t          *gen_mutex;
-    real                     *gpu_mem;
-    real                     *cpu_mem;
-    cudaEvent_t              event;
-    int                      device;
-    int                      major;
-    bool                     is_init;
-} _hl_thread_resource, *hl_thread_resource;
-
-extern __thread _hl_thread_resource t_resource;
-
-/**
- * @brief   Initialize cudnn.
- *
- * @param   cudnn_handle  Cudnn handle.
- * @param   stream        Cudnn stream.
- */
-extern void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream);
-
-/**
- * @brief   Initialize cublas.
- *
- * @param   cublas_handle  Cublas handle.
- * @param   stream         Cuda stream.
- */
-extern void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream);
-
-/**
- * @brief   Initialize cudnn tensor descriptor.
- *
- * @param   cudnn_desc    Cudnn tensor descriptor.
- */
-
-extern void hl_cudnn_desc_init(cudnnTensorDescriptor_t*  cudnn_desc);
-
-#endif  /* HL_THREAD_PH_ */
diff --git a/paddle/legacy/cuda/include/hl_time.h b/paddle/legacy/cuda/include/hl_time.h
deleted file mode 100644
index 61d80c065..000000000
--- a/paddle/legacy/cuda/include/hl_time.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TIME_H_
-#define HL_TIME_H_
-#include <cstdint>
-/**
- * @brief   High resolution timer.
- *
- * @return  int64_t the representation value of the object as a
- *                  count of periods, which are not necessarily
- *                  seconds.
- *
- * @note    It is used to generate random perturbation parameters.
- */
-int64_t getCurrentTimeStick(void);
-
-#endif /* HL_TIME_H_ */
diff --git a/paddle/legacy/cuda/include/hl_top_k.h b/paddle/legacy/cuda/include/hl_top_k.h
deleted file mode 100644
index a3c7872f5..000000000
--- a/paddle/legacy/cuda/include/hl_top_k.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TOP_K_H_
-#define HL_TOP_K_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   find top k element.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            input value.
- * @param[in]   lds            leading dimension of src.
- * @param[in]   dim            width of input value.
- * @param[in]   beamSize       beam size.
- * @param[in]   numSamples     height of input value.
- *
- */
-extern void hl_matrix_top_k(real* topVal,
-                            int ldv,
-                            int* topIds,
-                            real* src,
-                            int lds,
-                            int dim,
-                            int beamSize,
-                            int numSamples);
-
-/**
- * @brief   find top k element for each row in sparse matrix.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            sparse matrix.
- * @param[in]   beamSize       beam size.
- * @param[in]   numSamples     height of input value.
- *
- * @note    Only support HL_SPARSE_CSR format.
- */
-extern void hl_sparse_matrix_top_k(real* topVal,
-                                   int ldv,
-                                   int* topIds,
-                                   hl_sparse_matrix_s src,
-                                   int beamSize,
-                                   int numSamples);
-
-/**
- * @brief   Matrix classification error.
- *
- * @param[out]  topVal         top k element.
- * @param[in]   ldv            leading dimension of topVal.
- * @param[out]  topIds         top k index.
- * @param[in]   src            input value.
- * @param[in]   lds            leading dimension of src.
- * @param[in]   dim            width of input value.
- * @param[in]   topkSize       size of top k element.
- * @param[in]   numSamples     height of input value.
- * @param[in]   label          ground truth label.
- * @param[out]  recResult      top-k classification error.
- *
- */
-extern void hl_matrix_classification_error(real* topVal,
-                                           int ldv,
-                                           int* topIds,
-                                           real* src,
-                                           int lds,
-                                           int dim,
-                                           int topkSize,
-                                           int numSamples,
-                                           int* label,
-                                           real* recResult);
-
-#endif  // HL_TOP_K_H_
diff --git a/paddle/legacy/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
deleted file mode 100644
index 09cbd6d45..000000000
--- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#ifndef HL_WARPCTC_WRAP_H_
-#define HL_WARPCTC_WRAP_H_
-#include "ctc.h"
-#include "hl_base.h"
-
-typedef ctcStatus_t hl_warpctc_status_t;
-typedef ctcOptions hl_warpctc_options_t;
-
-/**
- * @brief Init ctc options.
- *
- * @param[in]   blank     blank label used in ctc loss function.
- * @param[in]   useGpu    whether use gpu.
- * @param[out]  options   handle to store cpu or gpu informations.
- *
- */
-extern void hl_warpctc_init(const size_t blank,
-                            bool useGpu,
-                            hl_warpctc_options_t* options);
-
-/**
- * @brief Compute the connectionist temporal classification loss,
- *        and optionally compute the gradient with respect to the inputs.
- *
- * if batchGrad == nullptr
- *
- *    only compute the ctc loss.
- *
- * if batchGrad != nullptr
- *
- *    compute both ctc loss and gradient.
- *
- * @param[in]   batchInput      batch matrix of input probabilities,
- *                              in maxSequenceLength x numSequence x numClasses
- *                              (row-major) format.
- * @param[out]  batchGrad       batch matrix of gradient.
- * @param[in]   cpuLabels       labels always in CPU memory.
- * @param[in]   cpuLabelLengths length of all labels in CPU memory.
- * @param[in]   cpuInputLengths length of all sequences in CPU memory.
- * @param[in]   numClasses      number of possible output symbols.
- * @param[in]   numSequences    number of sequence.
- * @param[out]  cpuCosts        cost of each sequence in CPU memory.
- * @param[out]  workspace       workspace to store some temporary results.
- * @param[in]   options         handle to store cpu or gpu informations.
- *
- */
-extern void hl_warpctc_compute_loss(const real* batchInput,
-                                    real* batchGrad,
-                                    const int* cpuLabels,
-                                    const int* cpuLabelLengths,
-                                    const int* cpuInputLengths,
-                                    const size_t numClasses,
-                                    const size_t numSequences,
-                                    real* cpuCosts,
-                                    void* workspace,
-                                    hl_warpctc_options_t* options);
-
-/**
- * @brief Compute the required workspace size.
- *        There is no memory allocated operations within warp-ctc.
- *
- * @param[in]   cpuLabelLengths length of all labels in CPU memory.
- * @param[in]   cpuInputLengths length of all sequences in CPU memory.
- * @param[in]   numClasses      number of possible output symbols.
- * @param[in]   numSequences    number of sequence.
- * @param[in]   options         handle to store cpu or gpu informations.
- * @param[out]  bytes           pointer to a scalar where the memory
- *                              requirement in bytes will be placed.
- *
- */
-extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
-                                          const int* cpuInputLengths,
-                                          const size_t numClasses,
-                                          const size_t numSequences,
-                                          hl_warpctc_options_t* options,
-                                          size_t* bytes);
-
-#endif  // HL_WARPCTC_WRAP_H_
-#endif
diff --git a/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h b/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
deleted file mode 100644
index 2ac841fac..000000000
--- a/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_AGGREGATE_STUB_H_
-#define HL_AGGREGATE_STUB_H_
-
-#include "hl_aggregate.h"
-
-inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {}
-
-inline void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {}
-
-#endif  // HL_AGGREGATE_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
deleted file mode 100644
index 997eed62e..000000000
--- a/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_STUB_H_
-#define HL_CNN_STUB_H_
-
-#include "hl_cnn.h"
-
-inline void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* MaskData) {}
-
-inline void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride) {}
-
-inline void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               const bool excludeMode) {}
-
-inline void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                const bool excludeMode) {}
-
-inline void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride) {}
-
-inline void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride) {}
-
-inline void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride) {}
-
-inline void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride) {}
-
-inline void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {}
-
-inline void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {}
-
-inline void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t group) {}
-
-inline void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t group) {}
-
-inline void hl_upsample_forward(real* inputData,
-                                real* maskData,
-                                size_t batchSize,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW,
-                                real* outputData) {}
-
-inline void hl_upsample_backward(real* outputGradData,
-                                 real* maskData,
-                                 size_t batchSize,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 real* inputGradData) {}
-
-#endif  // HL_CNN_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
deleted file mode 100644
index 0b2300cda..000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUBLAS_STUB_H_
-#define HL_CUDA_CUBLAS_STUB_H_
-
-#include "hl_cuda_cublas.h"
-
-inline void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {}
-
-inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {}
-
-inline void hl_matrix_inverse(
-    real *A_d, real *C_d, int dimN, int lda, int ldc) {}
-
-inline void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int ldb,
-                          int ldc) {}
-
-inline void hl_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta) {}
-
-#endif  // HL_CUDA_CUBLAS_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
deleted file mode 100644
index 4b8bdf750..000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_CUDNN_STUB_H_
-#define HL_CUDA_CUDNN_STUB_H_
-
-#include "hl_cuda_cudnn.h"
-
-inline int hl_get_cudnn_lib_version() { return 0; }
-
-inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {}
-
-inline void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width) {}
-
-inline void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                              int batch_size,
-                              int feature_maps,
-                              int height,
-                              int width,
-                              int nStride,
-                              int cStride,
-                              int hStride,
-                              int wStride) {}
-
-inline void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {}
-
-inline void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                         hl_pooling_mode_t mode,
-                                         int height,
-                                         int width,
-                                         int height_padding,
-                                         int width_padding,
-                                         int stride_height,
-                                         int stride_width) {}
-
-inline void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {}
-
-inline void hl_pooling_forward(hl_tensor_descriptor input,
-                               real* input_image,
-                               hl_tensor_descriptor output,
-                               real* output_image,
-                               hl_pooling_descriptor pooling) {}
-
-inline void hl_pooling_backward(hl_tensor_descriptor input,
-                                real* input_image,
-                                real* input_image_grad,
-                                hl_tensor_descriptor output,
-                                real* output_image,
-                                real* output_image_grad,
-                                hl_pooling_descriptor pooling) {}
-
-inline void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                        int input_feature_maps,
-                                        int output_feature_maps,
-                                        int height,
-                                        int width) {}
-
-inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {}
-
-inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                             hl_tensor_descriptor image,
-                                             hl_filter_descriptor filter,
-                                             int padding_height,
-                                             int padding_width,
-                                             int stride_height,
-                                             int stride_width,
-                                             int dilation_h,
-                                             int dilation_w) {}
-
-inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                            hl_tensor_descriptor image,
-                                            hl_filter_descriptor filter,
-                                            int padding_height,
-                                            int padding_width,
-                                            int stride_height,
-                                            int stride_width,
-                                            int dilation_h,
-                                            int dilation_w) {}
-
-inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
-
-inline void hl_conv_workspace(hl_tensor_descriptor input,
-                              hl_tensor_descriptor output,
-                              hl_filter_descriptor filter,
-                              hl_convolution_descriptor conv,
-                              int* convFwdAlgo,
-                              size_t* fwdLimitBytes,
-                              int* convBwdDataAlgo,
-                              size_t* bwdDataLimitBytes,
-                              int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes,
-                              bool useDilation) {}
-
-inline void hl_convolution_forward(hl_tensor_descriptor input,
-                                   real* input_data,
-                                   hl_tensor_descriptor output,
-                                   real* output_data,
-                                   hl_filter_descriptor filter,
-                                   real* filter_data,
-                                   hl_convolution_descriptor conv,
-                                   void* gpuWorkSpace,
-                                   size_t sizeInBytes,
-                                   int convFwdAlgo) {}
-
-inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                            real* bias_data,
-                                            hl_tensor_descriptor output,
-                                            real* output_data) {}
-
-inline void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                           real* input_data,
-                                           hl_tensor_descriptor output,
-                                           real* output_grad_data,
-                                           hl_filter_descriptor filter,
-                                           real* filter_grad_data,
-                                           hl_convolution_descriptor conv,
-                                           void* gpuWorkSpace,
-                                           size_t sizeInBytes,
-                                           int convBwdFilterAlgo) {}
-
-inline void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                         real* input_data_grad,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data,
-                                         hl_filter_descriptor filter,
-                                         real* filter_data,
-                                         hl_convolution_descriptor conv,
-                                         void* gpuWorkSpace,
-                                         size_t sizeInBytes,
-                                         int convBwdDataAlgo) {}
-
-inline void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                         real* bias_grad_data,
-                                         hl_tensor_descriptor output,
-                                         real* output_grad_data) {}
-
-inline void hl_softmax_forward(real* input,
-                               real* output,
-                               int height,
-                               int width) {}
-
-inline void hl_softmax_backward(real* output_value,
-                                real* output_grad,
-                                int height,
-                                int width) {}
-
-inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                           real* input,
-                                           hl_tensor_descriptor outputDesc,
-                                           real* output,
-                                           hl_tensor_descriptor bnParamDesc,
-                                           real* scale,
-                                           real* bias,
-                                           double factor,
-                                           real* runningMean,
-                                           real* runningInvVar,
-                                           double epsilon,
-                                           real* savedMean,
-                                           real* savedVar) {}
-
-inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                            real* input,
-                                            hl_tensor_descriptor outputDesc,
-                                            real* output,
-                                            hl_tensor_descriptor bnParamDesc,
-                                            real* scale,
-                                            real* bias,
-                                            real* estimatedMean,
-                                            real* estimatedVar,
-                                            double epsilon) {}
-
-inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                                   real* input,
-                                   hl_tensor_descriptor outGradDesc,
-                                   real* outGrad,
-                                   hl_tensor_descriptor inGradDesc,
-                                   real* inGrad,
-                                   hl_tensor_descriptor dBnParamDesc,
-                                   real* scale,
-                                   real* scaleGrad,
-                                   real* biasGrad,
-                                   double epsilon,
-                                   real* savedMean,
-                                   real* savedInvVar) {}
-
-#endif  // HL_CUDA_CUDNN_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_cuda_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_stub.h
deleted file mode 100644
index ac8b22ef3..000000000
--- a/paddle/legacy/cuda/include/stub/hl_cuda_stub.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CUDA_STUB_H_
-#define HL_CUDA_STUB_H_
-
-#include "hl_cuda.h"
-
-inline void hl_start() {}
-
-inline void hl_specify_devices_start(int *device, int number) {}
-
-inline void hl_init(int device) {}
-
-inline int hl_get_cuda_lib_version(int device) { return 0; }
-
-inline void hl_fini() {}
-
-inline void hl_set_sync_flag(bool flag) {}
-
-inline bool hl_get_sync_flag() { return false; }
-
-inline int hl_get_device_count() { return 0; }
-
-inline void hl_set_device(int device) {}
-
-inline int hl_get_device() { return 0; }
-
-inline void *hl_malloc_device(size_t size) { return NULL; }
-
-inline void hl_free_mem_device(void *dest_d) {}
-
-inline void *hl_malloc_host(size_t size) { return NULL; }
-
-inline void hl_free_mem_host(void *dest_h) {}
-
-inline void hl_memcpy(void *dst, void *src, size_t size) {}
-
-inline void hl_memset_device(void *dest_d, int value, size_t size) {}
-
-inline void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {}
-
-inline void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {}
-
-inline void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {}
-
-inline void hl_rand(real *dest_d, size_t num) {}
-
-inline void hl_srand(unsigned int seed) {}
-
-inline void hl_memcpy_async(void *dst,
-                            void *src,
-                            size_t size,
-                            hl_stream_t stream) {}
-
-inline void hl_stream_synchronize(hl_stream_t stream) {}
-
-inline void hl_create_event(hl_event_t *event) {}
-
-inline void hl_destroy_event(hl_event_t event) {}
-
-inline float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
-  return 0;
-}
-
-inline void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {}
-
-inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {}
-
-inline void hl_event_synchronize(hl_event_t event) {}
-
-inline int hl_get_device_last_error() { return 0; }
-
-inline const char *hl_get_device_error_string() { return NULL; }
-
-inline const char *hl_get_device_error_string(size_t err) { return NULL; }
-
-inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }
-
-inline void hl_device_synchronize() {}
-
-inline void hl_profiler_start() {}
-
-inline void hl_profiler_end() {}
-
-#endif  // HL_CUDA_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_lstm_stub.h b/paddle/legacy/cuda/include/stub/hl_lstm_stub.h
deleted file mode 100644
index be2b71787..000000000
--- a/paddle/legacy/cuda/include/stub/hl_lstm_stub.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_LSTM_STUB_H_
-#define HL_LSTM_STUB_H_
-
-#include "hl_lstm.h"
-
-inline void hl_lstm_parallel_forward(real *gateValue,
-                                     real *stateValue,
-                                     real *preOutputValue,
-                                     real *outputValue,
-                                     real *checkIg,
-                                     real *checkFg,
-                                     real *checkOg,
-                                     real *weight,
-                                     const int *sequence,
-                                     int frameSize,
-                                     int numSequences,
-                                     bool reversed,
-                                     hl_activation_mode_t active_node,
-                                     hl_activation_mode_t active_gate,
-                                     hl_activation_mode_t active_state) {}
-
-inline void hl_lstm_parallel_backward_data(real *gateValue,
-                                           real *gateGrad,
-                                           real *stateValue,
-                                           real *stateGrad,
-                                           real *preOutputValue,
-                                           real *preOutputGrad,
-                                           real *outputGrad,
-                                           real *checkIg,
-                                           real *checkIgGrad,
-                                           real *checkFg,
-                                           real *checkFgGrad,
-                                           real *checkOg,
-                                           real *checkOgGrad,
-                                           real *weight,
-                                           const int *sequence,
-                                           int frameSize,
-                                           int numSequences,
-                                           bool reversed,
-                                           hl_activation_mode_t active_node,
-                                           hl_activation_mode_t active_gate,
-                                           hl_activation_mode_t active_state) {}
-
-inline void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                             real *outputValue,
-                                             real *gateGrad,
-                                             const int *sequence,
-                                             int frameSize,
-                                             int batchSize,
-                                             int numSequences,
-                                             bool reversed) {}
-
-#endif  // HL_LSTM_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_matrix_stub.h b/paddle/legacy/cuda/include/stub/hl_matrix_stub.h
deleted file mode 100644
index 914a2edaf..000000000
--- a/paddle/legacy/cuda/include/stub/hl_matrix_stub.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_MATRIX_STUB_H_
-#define HL_MATRIX_STUB_H_
-
-#include "hl_matrix.h"
-
-inline void hl_matrix_add(real* A_d,
-                          real* B_d,
-                          real* C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta) {}
-
-inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {}
-
-inline void hl_sequence_softmax_forward(real* A_d,
-                                        real* C_d,
-                                        const int* index,
-                                        int numSequence) {}
-
-inline void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
-
-inline void hl_matrix_classification_error(real* topVal,
-                                           int ldv,
-                                           int* topIds,
-                                           real* src,
-                                           int lds,
-                                           int dim,
-                                           int topkSize,
-                                           int numSamples,
-                                           int* label,
-                                           real* recResult) {}
-
-inline void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
-
-inline void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy(
-    real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {}
-
-inline void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {}
-
-inline void hl_matrix_zero_mem(real* data, int num) {}
-
-inline void hl_param_relu_forward(real* output,
-                                  real* input,
-                                  real* w,
-                                  int width,
-                                  int height,
-                                  int partial_sum) {}
-
-inline void hl_param_relu_backward_w(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum) {}
-
-inline void hl_param_relu_backward_diff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum) {}
-
-inline void hl_matrix_add_shared_bias(real* A_d,
-                                      real* B_d,
-                                      const int channel,
-                                      const int dimM,
-                                      const int dimN,
-                                      real scale) {}
-
-inline void hl_matrix_collect_shared_bias(real* B_d,
-                                          real* A_d,
-                                          const int channel,
-                                          const int dimM,
-                                          const int dimN,
-                                          real scale) {}
-
-inline void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
-
-inline void hl_matrix_vol2Col(const real* dataSrc,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              real* dataDst) {}
-
-inline void hl_matrix_col2Vol(real* dataDst,
-                              int channels,
-                              int depth,
-                              int height,
-                              int width,
-                              int filterD,
-                              int filterH,
-                              int filterW,
-                              int strideD,
-                              int strideH,
-                              int strideW,
-                              int paddingD,
-                              int paddingH,
-                              int paddingW,
-                              const real* dataSrc,
-                              real alpha,
-                              real beta) {}
-
-inline void hl_vector_cast2int(int* out, real* vec, int size) {}
-
-#endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_sequence_stub.h b/paddle/legacy/cuda/include/stub/hl_sequence_stub.h
deleted file mode 100644
index 44bc3dbaf..000000000
--- a/paddle/legacy/cuda/include/stub/hl_sequence_stub.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SEQUENCE_STUB_H_
-#define HL_SEQUENCE_STUB_H_
-
-#include "hl_sequence.h"
-
-inline void hl_max_sequence_forward(real* input,
-                                    const int* sequence,
-                                    real* output,
-                                    int* index,
-                                    int numSequences,
-                                    int dim) {}
-
-inline void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
-
-inline void hl_sequence2batch_copy(real* batch,
-                                   real* sequence,
-                                   const int* batchIndex,
-                                   int seqWidth,
-                                   int batchCount,
-                                   bool seq2batch) {}
-
-inline void hl_sequence2batch_add(real* batch,
-                                  real* sequence,
-                                  int* batchIndex,
-                                  int seqWidth,
-                                  int batchCount,
-                                  bool seq2batch) {}
-
-inline void hl_sequence2batch_copy_padding(real* batch,
-                                           real* sequence,
-                                           const int* sequenceStartPositions,
-                                           const size_t sequenceWidth,
-                                           const size_t maxSequenceLength,
-                                           const size_t numSequences,
-                                           bool normByTimes,
-                                           bool seq2batch) {}
-
-inline void hl_sequence_avg_forward(real* dst,
-                                    real* src,
-                                    const int* starts,
-                                    int height,
-                                    int width,
-                                    const int mode) {}
-
-inline void hl_sequence_avg_backward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode) {}
-#endif  // HL_SEQUENCE_STUB_H_
diff --git a/paddle/legacy/cuda/include/stub/hl_sparse_stub.h b/paddle/legacy/cuda/include/stub/hl_sparse_stub.h
deleted file mode 100644
index 4001d4fb7..000000000
--- a/paddle/legacy/cuda/include/stub/hl_sparse_stub.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_SPARSE_STUB_H_
-#define HL_SPARSE_STUB_H_
-
-#include "hl_sparse.h"
-
-inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                    hl_matrix_format_t format,
-                                    hl_matrix_value_t value_type,
-                                    int dimM,
-                                    int dimN,
-                                    int nnz) {}
-
-inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {}
-
-inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       void *dest_d,
-                                       size_t size,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz) {}
-
-inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                       real *value_d,
-                                       int *rows_d,
-                                       int *cols_d,
-                                       hl_matrix_format_t format,
-                                       hl_matrix_value_t value_type,
-                                       int dimM,
-                                       int dimN,
-                                       int nnz) {}
-
-inline void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {}
-
-inline void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                                 real *csr_val,
-                                 int *csr_row,
-                                 int *csr_col,
-                                 hl_stream_t stream) {}
-
-inline void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                                 real *csc_val,
-                                 int *csc_row,
-                                 int *csc_col,
-                                 hl_stream_t stream) {}
-
-inline void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                                    hl_sparse_matrix_s src,
-                                    hl_stream_t stream) {}
-
-inline void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN) {}
-
-inline void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                                real *C_d,
-                                int dimM,
-                                int dimN) {}
-
-inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                                    hl_trans_op_t transa,
-                                    real *B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_matrix_dense_mul_csc(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_sparse_matrix_mul(real *A_d,
-                                 hl_trans_op_t transa,
-                                 real *B_d,
-                                 hl_trans_op_t transb,
-                                 hl_sparse_matrix_s C_d,
-                                 int dimM,
-                                 int dimN,
-                                 int dimK,
-                                 real alpha,
-                                 real beta) {}
-
-inline void hl_matrix_dense_mul_csr(real *A_d,
-                                    hl_trans_op_t transa,
-                                    hl_sparse_matrix_s B_d,
-                                    hl_trans_op_t transb,
-                                    real *C_d,
-                                    int dimM,
-                                    int dimN,
-                                    int dimK,
-                                    real alpha,
-                                    real beta) {}
-
-inline void hl_memcpy_from_csc_matrix(real *csc_val,
-                                      size_t val_size,
-                                      int *csc_row,
-                                      size_t row_size,
-                                      int *csc_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csc_matrix,
-                                      hl_stream_t stream) {}
-
-inline void hl_memcpy_from_csr_matrix(real *csr_val,
-                                      size_t val_size,
-                                      int *csr_row,
-                                      size_t row_size,
-                                      int *csr_col,
-                                      size_t col_size,
-                                      hl_sparse_matrix_s csr_matrix,
-                                      hl_stream_t stream) {}
-
-inline void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-
-inline void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {}
-
-inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                                      real *B_d,
-                                      real scale) {}
-
-inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d,
-                                   real *B_d,
-                                   real scale) {}
-
-inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                       real *B_d,
-                                       int dimM,
-                                       int dimN,
-                                       real alpha,
-                                       real beta) {}
-
-inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                                    real *B_d,
-                                    int dimM,
-                                    int dimN,
-                                    real alpha,
-                                    real beta) {}
-
-inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; }
-
-inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; }
-
-inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
-  return NULL;
-}
-
-#endif  // HL_SPARSE_STUB_H_
diff --git a/paddle/legacy/cuda/src/avx_mathfun.h b/paddle/legacy/cuda/src/avx_mathfun.h
deleted file mode 100644
index 8e698e746..000000000
--- a/paddle/legacy/cuda/src/avx_mathfun.h
+++ /dev/null
@@ -1,735 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-/*
-   AVX implementation of sin, cos, sincos, exp and log
-
-   Based on "sse_mathfun.h", by Julien Pommier
-   http://gruntthepeon.free.fr/ssemath/
-
-   Copyright (C) 2012 Giovanni Garberoglio
-   Interdisciplinary Laboratory for Computational Science (LISC)
-   Fondazione Bruno Kessler and University of Trento
-   via Sommarive, 18
-   I-38123 Trento (Italy)
-
-  This software is provided 'as-is', without any express or implied
-  warranty.  In no event will the authors be held liable for any damages
-  arising from the use of this software.
-
-  Permission is granted to anyone to use this software for any purpose,
-  including commercial applications, and to alter it and redistribute it
-  freely, subject to the following restrictions:
-
-  1. The origin of this software must not be misrepresented; you must not
-     claim that you wrote the original software. If you use this software
-     in a product, an acknowledgment in the product documentation would be
-     appreciated but is not required.
-  2. Altered source versions must be plainly marked as such, and must not be
-     misrepresented as being the original software.
-  3. This notice may not be removed or altered from any source distribution.
-
-  (this is the zlib license)
-*/
-
-#include <immintrin.h>
-
-/* yes I know, the top of this file is quite ugly */
-#define ALIGN32_BEG
-#define ALIGN32_END __attribute__((aligned(32)))
-
-/* __m128 is ugly to write */
-typedef __m256 v8sf;   // vector of 8 float (avx)
-typedef __m256i v8si;  // vector of 8 int   (avx)
-typedef __m128i v4si;  // vector of 8 int   (avx)
-
-#define _PI32AVX_CONST(Name, Val)                                 \
-  static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \
-      Val, Val, Val, Val}
-
-_PI32AVX_CONST(1, 1);
-_PI32AVX_CONST(inv1, ~1);
-_PI32AVX_CONST(2, 2);
-_PI32AVX_CONST(4, 4);
-
-/* declare some AVX constants -- why can't I figure a better way to do that? */
-#define _PS256_CONST(Name, Val)                                   \
-  static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PI32_CONST256(Name, Val)                                  \
-  static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-#define _PS256_CONST_TYPE(Name, Type, Val)                       \
-  static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \
-      Val, Val, Val, Val, Val, Val, Val, Val}
-
-_PS256_CONST(1, 1.0f);
-_PS256_CONST(0p5, 0.5f);
-/* the smallest non denormalized float number */
-_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000);
-_PS256_CONST_TYPE(mant_mask, int, 0x7f800000);
-_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);
-
-_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000);
-_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000);
-
-_PI32_CONST256(0, 0);
-_PI32_CONST256(1, 1);
-_PI32_CONST256(inv1, ~1);
-_PI32_CONST256(2, 2);
-_PI32_CONST256(4, 4);
-_PI32_CONST256(0x7f, 0x7f);
-
-_PS256_CONST(cephes_SQRTHF, 0.707106781186547524);
-_PS256_CONST(cephes_log_p0, 7.0376836292E-2);
-_PS256_CONST(cephes_log_p1, -1.1514610310E-1);
-_PS256_CONST(cephes_log_p2, 1.1676998740E-1);
-_PS256_CONST(cephes_log_p3, -1.2420140846E-1);
-_PS256_CONST(cephes_log_p4, +1.4249322787E-1);
-_PS256_CONST(cephes_log_p5, -1.6668057665E-1);
-_PS256_CONST(cephes_log_p6, +2.0000714765E-1);
-_PS256_CONST(cephes_log_p7, -2.4999993993E-1);
-_PS256_CONST(cephes_log_p8, +3.3333331174E-1);
-_PS256_CONST(cephes_log_q1, -2.12194440e-4);
-_PS256_CONST(cephes_log_q2, 0.693359375);
-
-#ifndef __AVX2__
-
-typedef union imm_xmm_union {
-  v8si imm;
-  v4si xmm[2];
-} imm_xmm_union;
-
-#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_)       \
-  {                                               \
-    imm_xmm_union u __attribute__((aligned(32))); \
-    u.imm = imm_;                                 \
-    xmm0_ = u.xmm[0];                             \
-    xmm1_ = u.xmm[1];                             \
-  }
-
-#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_)       \
-  {                                               \
-    imm_xmm_union u __attribute__((aligned(32))); \
-    u.xmm[0] = xmm0_;                             \
-    u.xmm[1] = xmm1_;                             \
-    imm_ = u.imm;                                 \
-  }
-
-#define AVX2_BITOP_USING_SSE2(fn)                        \
-  static inline v8si avx2_mm256_##fn(v8si x, int a) {    \
-    /* use SSE2 instruction to perform the bitop AVX2 */ \
-    v4si x1, x2;                                         \
-    v8si ret;                                            \
-    COPY_IMM_TO_XMM(x, x1, x2);                          \
-    x1 = _mm_##fn(x1, a);                                \
-    x2 = _mm_##fn(x2, a);                                \
-    COPY_XMM_TO_IMM(x1, x2, ret);                        \
-    return (ret);                                        \
-  }
-
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
-AVX2_BITOP_USING_SSE2(slli_epi32)
-AVX2_BITOP_USING_SSE2(srli_epi32)
-
-#define AVX2_INTOP_USING_SSE2(fn)                                     \
-  static inline v8si avx2_mm256_##fn(v8si x, v8si y) {                \
-    /* use SSE2 instructions to perform the AVX2 integer operation */ \
-    v4si x1, x2;                                                      \
-    v4si y1, y2;                                                      \
-    v8si ret;                                                         \
-    COPY_IMM_TO_XMM(x, x1, x2);                                       \
-    COPY_IMM_TO_XMM(y, y1, y2);                                       \
-    x1 = _mm_##fn(x1, y1);                                            \
-    x2 = _mm_##fn(x2, y2);                                            \
-    COPY_XMM_TO_IMM(x1, x2, ret);                                     \
-    return (ret);                                                     \
-  }
-
-//#warning "Using SSE2 to perform AVX2 integer ops"
-AVX2_INTOP_USING_SSE2(and_si128)
-AVX2_INTOP_USING_SSE2(andnot_si128)
-AVX2_INTOP_USING_SSE2(cmpeq_epi32)
-AVX2_INTOP_USING_SSE2(sub_epi32)
-AVX2_INTOP_USING_SSE2(add_epi32)
-#define avx2_mm256_and_si256 avx2_mm256_and_si128
-#define avx2_mm256_andnot_si256 avx2_mm256_andnot_si128
-#else
-#define avx2_mm256_slli_epi32 _mm256_slli_epi32
-#define avx2_mm256_srli_epi32 _mm256_srli_epi32
-#define avx2_mm256_and_si256 _mm256_and_si256
-#define avx2_mm256_andnot_si256 _mm256_andnot_si256
-#define avx2_mm256_cmpeq_epi32 _mm256_cmpeq_epi32
-#define avx2_mm256_sub_epi32 _mm256_sub_epi32
-#define avx2_mm256_add_epi32 _mm256_add_epi32
-#endif /* __AVX2__ */
-
-/* natural logarithm computed for 8 simultaneous float
-   return NaN for x <= 0
-*/
-v8sf log256_ps(v8sf x) {
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
-  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
-
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
-
-  // can be done with AVX2
-  imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
-
-  // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  v8sf e = _mm256_cvtepi32_ps(imm0);
-
-  e = _mm256_add_ps(e, one);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
-  v8sf tmp = _mm256_and_ps(x, mask);
-  x = _mm256_sub_ps(x, one);
-  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
-  x = _mm256_add_ps(x, tmp);
-
-  v8sf z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
-  y = _mm256_mul_ps(y, x);
-
-  y = _mm256_mul_ps(y, z);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
-  y = _mm256_add_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
-  x = _mm256_add_ps(x, y);
-  x = _mm256_add_ps(x, tmp);
-  x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
-  return x;
-}
-
-_PS256_CONST(exp_hi, 88.3762626647949f);
-_PS256_CONST(exp_lo, -88.3762626647949f);
-
-_PS256_CONST(cephes_LOG2EF, 1.44269504088896341);
-_PS256_CONST(cephes_exp_C1, 0.693359375);
-_PS256_CONST(cephes_exp_C2, -2.12194440e-4);
-
-_PS256_CONST(cephes_exp_p0, 1.9875691500E-4);
-_PS256_CONST(cephes_exp_p1, 1.3981999507E-3);
-_PS256_CONST(cephes_exp_p2, 8.3334519073E-3);
-_PS256_CONST(cephes_exp_p3, 4.1665795894E-2);
-_PS256_CONST(cephes_exp_p4, 1.6666665459E-1);
-_PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
-
-v8sf exp256_ps(v8sf x) {
-  v8sf tmp = _mm256_setzero_ps(), fx;
-  v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
-
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
-
-  /* how to perform a floorf with SSE: just below */
-  // imm0 = _mm256_cvttps_epi32(fx);
-  // tmp  = _mm256_cvtepi32_ps(imm0);
-
-  tmp = _mm256_floor_ps(fx);
-
-  /* if greater, substract 1 */
-  // v8sf mask = _mm256_cmpgt_ps(tmp, fx);
-  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);
-  mask = _mm256_and_ps(mask, one);
-  fx = _mm256_sub_ps(tmp, mask);
-
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
-  x = _mm256_sub_ps(x, tmp);
-  x = _mm256_sub_ps(x, z);
-
-  z = _mm256_mul_ps(x, x);
-
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
-  y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, x);
-  y = _mm256_add_ps(y, one);
-
-  /* build 2^n */
-  imm0 = _mm256_cvttps_epi32(fx);
-  // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
-  imm0 = avx2_mm256_slli_epi32(imm0, 23);
-  v8sf pow2n = _mm256_castsi256_ps(imm0);
-  y = _mm256_mul_ps(y, pow2n);
-  return y;
-}
-
-_PS256_CONST(minus_cephes_DP1, -0.78515625);
-_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4);
-_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8);
-_PS256_CONST(sincof_p0, -1.9515295891E-4);
-_PS256_CONST(sincof_p1, 8.3321608736E-3);
-_PS256_CONST(sincof_p2, -1.6666654611E-1);
-_PS256_CONST(coscof_p0, 2.443315711809948E-005);
-_PS256_CONST(coscof_p1, -1.388731625493765E-003);
-_PS256_CONST(coscof_p2, 4.166664568298827E-002);
-_PS256_CONST(cephes_FOPI, 1.27323954473516);  // 4 / M_PI
-
-/* evaluation of 8 sines at onces using AVX intrisics
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-
-*/
-v8sf sin256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  sign_bit = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-/*
-  Here we start a series of integer operations, which are in the
-  realm of AVX2.
-  If we don't have AVX, let's perform them using SSE2 directives
-*/
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* almost the same as sin_ps */
-v8sf cos256_ps(v8sf x) {  // any x
-  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
-  v8si imm0, imm2;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-#endif
-
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in mm0 */
-  imm2 = _mm256_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-  y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
-
-  /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-#else
-
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-
-  v8sf sign_bit = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
-  v8sf z = _mm256_mul_ps(x, x);
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  y2 = _mm256_and_ps(xmm3, y2);  //, xmm3);
-  y = _mm256_andnot_ps(xmm3, y);
-  y = _mm256_add_ps(y, y2);
-  /* update the sign */
-  y = _mm256_xor_ps(y, sign_bit);
-
-  return y;
-}
-
-/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
-   replace both of them..
-   it is almost as fast, and gives you a free cosine with your sine */
-void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
-  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
-  v8si imm0, imm2, imm4;
-
-#ifndef __AVX2__
-  v4si imm0_1, imm0_2;
-  v4si imm2_1, imm2_2;
-  v4si imm4_1, imm4_2;
-#endif
-
-  sign_bit_sin = x;
-  /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
-  /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
-
-  /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
-
-#ifdef __AVX2__
-  /* store the integer part of y in imm2 */
-  imm2 = _mm256_cvttps_epi32(y);
-
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
-
-  y = _mm256_cvtepi32_ps(imm2);
-  imm4 = imm2;
-
-  /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
-  imm0 = avx2_mm256_slli_epi32(imm0, 29);
-  // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-
-  /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
-// v8sf poly_mask = _mm256_castsi256_ps(imm2);
-#else
-  /* we use SSE2 routines to perform the integer ops */
-  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
-
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-  y = _mm256_cvtepi32_ps(imm2);
-
-  imm4_1 = imm2_1;
-  imm4_2 = imm2_2;
-
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
-
-  imm0_1 = _mm_slli_epi32(imm0_1, 29);
-  imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
-
-  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-#endif
-  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-  v8sf poly_mask = _mm256_castsi256_ps(imm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
-  xmm1 = _mm256_mul_ps(y, xmm1);
-  xmm2 = _mm256_mul_ps(y, xmm2);
-  xmm3 = _mm256_mul_ps(y, xmm3);
-  x = _mm256_add_ps(x, xmm1);
-  x = _mm256_add_ps(x, xmm2);
-  x = _mm256_add_ps(x, xmm3);
-
-#ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
-  imm4 = avx2_mm256_slli_epi32(imm4, 29);
-#else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
-
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
-
-  imm4_1 = _mm_slli_epi32(imm4_1, 29);
-  imm4_2 = _mm_slli_epi32(imm4_2, 29);
-
-  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
-#endif
-
-  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
-
-  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
-
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
-  y = _mm256_mul_ps(y, z);
-  y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
-  y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
-  y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_mul_ps(y2, x);
-  y2 = _mm256_add_ps(y2, x);
-
-  /* select the correct result from the two polynoms */
-  xmm3 = poly_mask;
-  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
-  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
-  y2 = _mm256_sub_ps(y2, ysin2);
-  y = _mm256_sub_ps(y, ysin1);
-
-  xmm1 = _mm256_add_ps(ysin1, ysin2);
-  xmm2 = _mm256_add_ps(y, y2);
-
-  /* update the sign */
-  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
-  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
-}
diff --git a/paddle/legacy/cuda/src/hl_avx_functions.cc b/paddle/legacy/cuda/src/hl_avx_functions.cc
deleted file mode 100644
index 6fb7c9dd0..000000000
--- a/paddle/legacy/cuda/src/hl_avx_functions.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <immintrin.h>
-#include "hl_functions.h"
-
-namespace hppl {
-
-extern __m256 exp(__m256 a);
-
-__m256 relu(const __m256 a) {
-  __m256 tmp = _mm256_set1_ps(0.0f);
-  return _mm256_max_ps(a, tmp);
-}
-
-__m256 sigmoid(const __m256 a) {
-  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
-  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
-  __m256 tmp = _mm256_max_ps(a, min);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
-  tmp = exp(tmp);
-  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
-  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
-  return tmp;
-}
-
-__m256 tanh(const __m256 a) {
-  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
-  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
-  tmp = _mm256_min_ps(tmp, max);
-  tmp = exp(tmp);
-  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
-                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
-                       _mm256_set1_ps(1.0f));
-}
-
-__m256 linear(const __m256 a) { return a; }
-
-__m256 relu(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a,
-      _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
-                    _mm256_set1_ps(1.0f)));
-}
-
-__m256 sigmoid(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(_mm256_mul_ps(a, b),
-                       _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
-}
-
-__m256 tanh(const __m256 a, const __m256 b) {
-  return _mm256_mul_ps(
-      a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
-}
-
-__m256 linear(const __m256 a, const __m256 b) { return a; }
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_batch_norm.cu b/paddle/legacy/cuda/src/hl_batch_norm.cu
deleted file mode 100644
index f9ffde0d5..000000000
--- a/paddle/legacy/cuda/src/hl_batch_norm.cu
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_batch_norm.h"
-
-__global__ void batchNormInference(real* output,
-                                   const real* input,
-                                   const real* scale,
-                                   const real* bias,
-                                   const real* estimatedMean,
-                                   const real* estimatedVar,
-                                   const double epsilon,
-                                   size_t batchSize,
-                                   size_t channel,
-                                   size_t height,
-                                   size_t width) {
-  const int tid = threadIdx.x;
-  const int num = channel * height * width;
-  const int batch = blockIdx.x;
-  for (int i = tid; i < num; i += blockDim.x) {
-    const int c = i / (height * width);
-    const int id = batch * num + i;
-    real val = input[id] - estimatedMean[c];
-    val /= sqrt(estimatedVar[c] + epsilon);
-    val *= scale[c];
-    val += bias[c];
-    output[id] = val;
-  }
-}
-
-void hl_batch_norm_cuda_inference(const real* input,
-                                  real* output,
-                                  const real* scale,
-                                  const real* bias,
-                                  const real* estimatedMean,
-                                  const real* estimatedVar,
-                                  const double epsilon,
-                                  size_t batchSize,
-                                  size_t channel,
-                                  size_t height,
-                                  size_t width) {
-  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
-                                                            input,
-                                                            scale,
-                                                            bias,
-                                                            estimatedMean,
-                                                            estimatedVar,
-                                                            epsilon,
-                                                            batchSize,
-                                                            channel,
-                                                            height,
-                                                            width);
-
-  CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
-}
diff --git a/paddle/legacy/cuda/src/hl_batch_transpose.cu b/paddle/legacy/cuda/src/hl_batch_transpose.cu
deleted file mode 100644
index 221839905..000000000
--- a/paddle/legacy/cuda/src/hl_batch_transpose.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_batch_transpose.h"
-
-const int TILE_DIM = 64;
-const int BLOCK_ROWS = 16;
-
-// No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(
-    real* odata, const real* idata, int numSamples, int width, int height) {
-  __shared__ float tile[TILE_DIM][TILE_DIM + 1];
-
-  const int x = blockIdx.x * TILE_DIM + threadIdx.x;
-  const int y = blockIdx.y * TILE_DIM + threadIdx.y;
-  const int sampleId = blockIdx.z;
-  if (sampleId > numSamples) return;
-  if (x < width) {
-    for (int j = threadIdx.y; j < TILE_DIM && j < height - y + threadIdx.y;
-         j += BLOCK_ROWS)
-      tile[j][threadIdx.x] =
-          idata[sampleId * width * height + (y + j - threadIdx.y) * width + x];
-  }
-
-  __syncthreads();
-
-  // The matrix is tranposed. Thus height is new width, and width is new height.
-  const int newX = blockIdx.y * TILE_DIM + threadIdx.x;
-  const int newY = blockIdx.x * TILE_DIM + threadIdx.y;
-  if (newX >= height) {
-    return;
-  }
-  for (int j = threadIdx.y; j < TILE_DIM && j < width - newY + threadIdx.y;
-       j += BLOCK_ROWS)
-    odata[sampleId * width * height + (newY + j - threadIdx.y) * height +
-          newX] = tile[threadIdx.x][j];
-}
-
-void batchTranspose(
-    const real* input, real* output, int width, int height, int batchSize) {
-  dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
-  dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-      output, input, batchSize, width, height);
-
-  CHECK_SYNC("batchTranspose failed!");
-}
diff --git a/paddle/legacy/cuda/src/hl_cpu_functions.cc b/paddle/legacy/cuda/src/hl_cpu_functions.cc
deleted file mode 100644
index 1306576bc..000000000
--- a/paddle/legacy/cuda/src/hl_cpu_functions.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <math.h>
-#include "hl_functions.h"
-
-namespace hppl {
-
-real relu(const real a) { return a > 0.0f ? a : 0.0f; }
-
-real sigmoid(const real a) {
-  const real min = SIGMOID_THRESHOLD_MIN;
-  const real max = SIGMOID_THRESHOLD_MAX;
-  real tmp = (a < min) ? min : ((a > max) ? max : a);
-  return 1.0 / (1.0 + exp(-tmp));
-}
-
-real tanh(const real a) {
-  real tmp = -2.0 * a;
-  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-  return (2.0 / (1.0 + exp(tmp))) - 1.0;
-}
-
-real linear(const real a) { return a; }
-
-real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
-
-real sigmoid(const real a, const real b) { return a * b * (1 - b); }
-
-real tanh(const real a, const real b) { return a * (1.0f - b * b); }
-
-real linear(const real a, const real b) { return a; }
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
deleted file mode 100644
index 9831c5ecc..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
+++ /dev/null
@@ -1,293 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_aggregate.h"
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_cuda.ph"
-#include "hl_matrix_base.cuh"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-/**
- * @brief   matrix row operator.
- */
-template <class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
-  __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize - 1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
-  int index = rowId * dimN;
-  int tid = threadIdx.x;
-  int lmt = tid;
-
-  real tmp = agg.init();
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    tmp = agg(tmp, E[index + lmt]);
-    lmt += blockSize;
-  }
-  sum_s[tid] = tmp;
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[rowId] = sum_s[0];
-  }
-}
-
-template <class Agg>
-void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      agg, A_d, C_d, dimN);
-}
-
-void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_sum failed");
-}
-
-void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_max failed");
-}
-
-void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_min failed");
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg>
-__global__ void KeMatrixColumnOp(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (int index = 0; index < dimM; index++) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-    }
-    Sum[rowIdx] = tmp;
-  }
-}
-
-template <class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  __shared__ real _sum[blockDimX * blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int index = threadIdx.y;
-
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (; index < dimM;) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-      index += blockDimY;
-    }
-  }
-  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y == 0) {
-      real tmp = agg.init();
-      for (int i = 0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
-      }
-      Sum[rowIdx] = tmp;
-    }
-  }
-}
-
-template <class Agg>
-void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 - 1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  } else {
-    int blocksX = (dimN + 32 - 1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  }
-
-  return;
-}
-
-void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_sum failed");
-}
-
-void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_max failed");
-}
-
-void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_min failed");
-}
-
-template <int blockSize>
-__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += E[index];
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
-
-template <int blockSize>
-__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += abs(E[index]);
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cnn.cu b/paddle/legacy/cuda/src/hl_cuda_cnn.cu
deleted file mode 100644
index bac743a29..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cnn.cu
+++ /dev/null
@@ -1,1106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <float.h>
-#include "hl_base.h"
-#include "hl_cnn.h"
-#include "hl_device_functions.cuh"
-
-__global__ void KeMaxPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int ksizeW,
-                                 const int ksizeH,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int offsetH,
-                                 const int offsetW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 real* maskData) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-    int hstart = ph * strideH - offsetH;
-    int wstart = pw * strideW - offsetW;
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int max_index = -1;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w]) {
-          max_index = h * width + w;
-          maxval = inputData[max_index];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    if (maskData != NULL) {
-      maskData[tgtIndex] = max_index;
-    }
-  }
-}
-
-void hl_maxpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        real* maskData) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         tgtData,
-                                                         tgtStride,
-                                                         maskData);
-  CHECK_SYNC("hl_maxpool_forward failed");
-}
-
-__global__ void KeMaxPoolBackward(const int nthreads,
-                                  const real* inputData,
-                                  const real* outData,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  const int outStride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    // find out the local index
-    // find out the local offset
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-
-    int frameNum = index / width / height / channels;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    real input = inputData[index];
-    outData += (frameNum * outStride + offsetC * pooledH * pooledW);
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        if (input == outData[ph * pooledW + pw]) {
-          gradient += outGrad[ph * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_maxpool_backward(const int frameCnt,
-                         const real* inputData,
-                         const real* outData,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* targetGrad,
-                         const int outStride) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         outData,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         targetGrad,
-                                                         outStride);
-  CHECK_SYNC("hl_maxpool_backward");
-}
-
-__global__ void KeAvgPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeX,
-                                 const int sizeY,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int padH,
-                                 const int padW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int poolSize =
-        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        aveval += inputData[h * width + w];
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / poolSize;
-  }
-}
-
-void hl_avgpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        const bool excludeMode) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                        inputData,
-                                                        channels,
-                                                        height,
-                                                        width,
-                                                        pooledH,
-                                                        pooledW,
-                                                        sizeX,
-                                                        sizeY,
-                                                        strideH,
-                                                        strideW,
-                                                        paddingH,
-                                                        paddingW,
-                                                        tgtData,
-                                                        tgtStride,
-                                                        excludeMode);
-  CHECK_SYNC("hl_avgpool_forward failed");
-}
-
-__global__ void KeAvgPoolBackward(const int nthreads,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* tgtGrad,
-                                  const int outStride,
-                                  const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-    int frameNum = index / width / height / channels;
-
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-
-    for (int ph = phstart; ph < phend; ++ph) {
-      int hstart = ph * strideH - padH;
-      int hend = min(hstart + sizeY, height);
-      hstart = max(hstart, 0);
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        // figure out the pooling size
-        int wstart = pw * strideW - padW;
-        int wend = min(wstart + sizeX, width);
-        wstart = max(wstart, 0);
-        int poolSize =
-            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-        gradient += outGrad[ph * pooledW + pw] / poolSize;
-      }
-    }
-    tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_avgpool_backward(const int frameCnt,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* backGrad,
-                         const int outStride,
-                         const bool excludeMode) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         backGrad,
-                                                         outStride,
-                                                         excludeMode);
-  CHECK_SYNC("hl_avgpool_backward failed");
-}
-
-__global__ void KeMaxPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int ksizeD,
-                                   const int ksizeH,
-                                   const int ksizeW,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   real* maxPoolIdxData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + ksizeD, depth);
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int maxIdx = -1;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          if (maxval < inputData[(d * height + h) * width + w]) {
-            maxval = inputData[(d * height + h) * width + w];
-            maxIdx = (d * height + h) * width + w;
-          }
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    maxPoolIdxData[tgtIndex] = maxIdx;
-  }
-}
-
-void hl_maxpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int padD,
-                          const int padH,
-                          const int padW,
-                          real* tgtData,
-                          real* maxPoolIdxData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           inputData,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           pooledD,
-                                                           pooledH,
-                                                           pooledW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           padD,
-                                                           padH,
-                                                           padW,
-                                                           tgtData,
-                                                           maxPoolIdxData,
-                                                           tgtStride);
-  CHECK_SYNC("hl_maxpool3D_forward failed");
-}
-
-__global__ void KeMaxPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* targetGrad,
-                                    real* maxPoolIdxData,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width;
-    int offsetH = (index / width) % height;
-    int offsetD = (index / width / height) % depth;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart =
-        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
-    int phstart =
-        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
-    int pwstart =
-        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
-    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
-    int phend = min((offsetH + padH) / strideH + 1, pooledH);
-    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    maxPoolIdxData +=
-        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (((offsetD * height + offsetH) * width + offsetW) ==
-              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
-            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
-  }
-}
-
-void hl_maxpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           const int paddingD,
-                           const int paddingH,
-                           const int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* targetGrad,
-                           real* maxPoolIdxData,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           targetGrad,
-                                                           maxPoolIdxData,
-                                                           outStride);
-  CHECK_SYNC("hl_maxpool3D_backward");
-}
-
-__global__ void KeAvgPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int sizeZ,
-                                   const int sizeY,
-                                   const int sizeX,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth);
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += inputData[(d * height + h) * width + w];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
-  }
-}
-
-void hl_avgpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int paddingD,
-                          const int paddingH,
-                          const int paddingW,
-                          real* tgtData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          inputData,
-                                                          channels,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          pooledD,
-                                                          pooledH,
-                                                          pooledW,
-                                                          sizeZ,
-                                                          sizeY,
-                                                          sizeX,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          tgtData,
-                                                          tgtStride);
-  CHECK_SYNC("hl_avgpool3D_forward failed");
-}
-
-__global__ void KeAvgPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* tgtGrad,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetD = (index / width / height) % depth + padD;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int pdend = min(offsetD / strideD + 1, pooledD);
-    int phend = min(offsetH / strideH + 1, pooledH);
-    int pwend = min(offsetW / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
-
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      int dstart = pd * strideD - padD;
-      int dend = min(dstart + sizeZ, depth);
-      dstart = max(dstart, 0);
-      for (int ph = phstart; ph < phend; ++ph) {
-        int hstart = ph * strideH - padH;
-        int hend = min(hstart + sizeY, height);
-        hstart = max(hstart, 0);
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          // figure out the pooling size
-          int wstart = pw * strideW - padW;
-          int wend = min(wstart + sizeX, width);
-          wstart = max(wstart, 0);
-          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
-        }
-      }
-    }
-    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
-  }
-}
-
-void hl_avgpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           int paddingD,
-                           int paddingH,
-                           int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* backGrad,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           backGrad,
-                                                           outStride);
-  CHECK_SYNC("hl_avgpool3D_backward failed");
-}
-
-__global__ void KeBilinearInterpFw(const real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                            inImgIdy * inImgW + inImgIdx];
-
-    // bilinear interpolation
-    out[outIdH * outputW + outIdW] =
-        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
-        h1lambda * (w2lambda * inPos[hId * inImgW] +
-                    w1lambda * inPos[hId * inImgW + wId]);
-  }
-}
-
-void hl_bilinear_forward(const real* inData,
-                         const size_t inImgH,
-                         const size_t inImgW,
-                         const size_t inputH,
-                         const size_t inputW,
-                         real* outData,
-                         const size_t outImgH,
-                         const size_t outImgW,
-                         const size_t outputH,
-                         const size_t outputW,
-                         const size_t numChannels,
-                         const real ratioH,
-                         const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outData,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_forward failed");
-}
-
-__global__ void KeBilinearInterpBw(real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   const real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                      inImgIdy * inImgW + inImgIdx];
-    const real* outPos = &out[outIdH * outputW + outIdW];
-    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW],
-                            h1lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
-                            h1lambda * w1lambda * outPos[0]);
-  }
-}
-
-void hl_bilinear_backward(real* inGrad,
-                          const size_t inImgH,
-                          const size_t inImgW,
-                          const size_t inputH,
-                          const size_t inputW,
-                          const real* outGrad,
-                          const size_t outImgH,
-                          const size_t outImgW,
-                          const size_t outputH,
-                          const size_t outputW,
-                          const size_t numChannels,
-                          const real ratioH,
-                          const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outGrad,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_backward failed");
-}
-
-__global__ void maxoutFpCompute(size_t nthreads,
-                                const real* inData,
-                                real* outData,
-                                int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t data_idx =
-        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
-    real max = inData[data_idx];
-    int maxId = 0;
-    for (size_t g = 1; g < groups; ++g) {
-      real tmp = inData[data_idx + g * featLen];
-      if (tmp > max) {
-        max = tmp;
-        maxId = g;
-      }
-    }
-    outData[index] = max;
-    idData[index] = maxId;
-  }
-}
-
-void hl_maxout_forward(const real* inData,
-                       real* outData,
-                       int* idData,
-                       size_t batchSize,
-                       size_t size,
-                       size_t featLen,
-                       size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inData, outData, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_forward failed");
-}
-
-__global__ void maxoutBpCompute(size_t nthreads,
-                                real* inGrad,
-                                const real* outGrad,
-                                const int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t newIndex = batch_idx * size;
-    size_t gradIdx =
-        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
-    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
-  }
-}
-
-void hl_maxout_backward(real* inGrad,
-                        const real* outGrad,
-                        const int* idData,
-                        size_t batchSize,
-                        size_t size,
-                        size_t featLen,
-                        size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_backward failed");
-}
-
-__global__ void upsampleForwardCompute(real* input_data,
-                                       real* mask_data,
-                                       size_t nthreads,
-                                       size_t in_h,
-                                       size_t in_w,
-                                       size_t out_h,
-                                       size_t out_w,
-                                       real* output_data) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offset = index / (in_w * in_h) * out_h * out_w;
-    int upsample_idx = static_cast<int>(mask_data[index]);
-    output_data[offset + upsample_idx] = input_data[index];
-  }
-}
-
-__global__ void upsampleBackwardCompute(real* out_grad,
-                                        real* mask_data,
-                                        size_t nthreads,
-                                        size_t in_h,
-                                        size_t in_w,
-                                        size_t out_h,
-                                        size_t out_w,
-                                        real* input_grad) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offset = index / (in_w * in_h) * out_h * out_w;
-    int upsample_idx = static_cast<int>(mask_data[index]);
-    input_grad[index] = out_grad[offset + upsample_idx];
-  }
-}
-
-void hl_upsample_forward(real* inputData,
-                         real* maskData,
-                         size_t batchSize,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t channels,
-                         size_t outputH,
-                         size_t outputW,
-                         real* outputData) {
-  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
-                                                              maskData,
-                                                              num_kernels,
-                                                              imgSizeH,
-                                                              imgSizeW,
-                                                              outputH,
-                                                              outputW,
-                                                              outputData);
-  CHECK_SYNC("hl_upsample_forward failed");
-}
-
-void hl_upsample_backward(real* outputGradData,
-                          real* maskData,
-                          size_t batchSize,
-                          size_t imgSizeH,
-                          size_t imgSizeW,
-                          size_t channels,
-                          size_t outputH,
-                          size_t outputW,
-                          real* inputGradData) {
-  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
-                                                               maskData,
-                                                               num_kernels,
-                                                               imgSizeH,
-                                                               imgSizeW,
-                                                               outputH,
-                                                               outputW,
-                                                               inputGradData);
-  CHECK_SYNC("hl_upsample_backward failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cublas.cc b/paddle/legacy/cuda/src/hl_cuda_cublas.cc
deleted file mode 100644
index 283b8b6e9..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cublas.cc
+++ /dev/null
@@ -1,400 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cublas.h"
-#include <sys/time.h>
-#include "hl_cuda.h"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cublas routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    cublasStatus_t operator()(Args... args) {                                  \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                           \
-      std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                    \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    cublasStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-
-// include all needed cublas functions in HPPL
-// clang-format off
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
-
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
-
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
-
-} /* namespace dynload */
-
-// clang-format on
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
-#endif
-
-const char *hl_cublas_get_error_string(cublasStatus_t status) {
-  switch (status) {
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "[cublas status]: not initialized";
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "[cublas status]: allocate failed";
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "[cublas status]: invalid value";
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "[cublas status]: arch mismatch";
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "[cublas status]: mapping error";
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "[cublas status]: execution failed";
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "[cublas status]: internal error";
-    case CUBLAS_STATUS_SUCCESS:
-      return "[cublas status]: success";
-    default:
-      return "[cublas status]: unknown error";
-  }
-}
-
-/**
- * Check build-in cublas function using glog and it also
- * support << operator for more details error info.
- */
-cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func)               \
-  g_cublasStat = cublas_func;                   \
-  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
-      << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
-
-void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
-  CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
-      << "[cublas init] Cublas create handle faild!";
-
-  CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
-      << "[cublas init] Cublas set stream faild!";
-}
-
-void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
-  real alpha = 1.0;
-  real beta = 0.0;
-
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           dimM,
-                           dimN,
-                           &alpha,
-                           A_d,
-                           lda,
-                           &beta,
-                           nullptr,
-                           dimM,
-                           C_d,
-                           ldc));
-  CHECK_SYNC("hl_matrix_transpose failed");
-}
-
-void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
-  hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
-}
-
-void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
-  /* Solve Ax = I */
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  /* Step 1: Compute the LU decomposition of matrix A */
-  real **inout_h = &A_d;
-  real **inout_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(inout_d, inout_h, sizeof(real *));
-
-  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
-  int *info_d = (int *)t_resource.gpu_mem;
-
-  /* Note: cublasSgetrfBatched is used to calculate a number of
-     small-sized matrices. There may be a better way to reconstruct
-     the API for better performance.
-   */
-  CHECK_CUBLAS(
-      CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
-
-  int info_h;
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
-  }
-
-  /* Step 2: Compute the inverse of the matrix given its LU decomposition */
-  real **out_h = &C_d;
-  real **out_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(out_d, out_h, sizeof(real *));
-
-  CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-                            dimN,
-                            (const real **)inout_d,
-                            lda,
-                            pivot_d,
-                            out_d,
-                            ldc,
-                            info_d,
-                            1));
-
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
-  }
-
-  hl_free_mem_device(inout_d);
-  hl_free_mem_device(pivot_d);
-  hl_free_mem_device(out_d);
-
-  CHECK_SYNC("hl_matrix_inverse failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta,
-                   int lda,
-                   int ldb,
-                   int ldc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
-    int m = (transa == HPPL_OP_N) ? dimM : dimK;
-    int n = (transa == HPPL_OP_N) ? dimK : dimM;
-    hl_matrix_mul_vector(
-        A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
-    return;
-  }
-
-  if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) {
-    int m = (transb == HPPL_OP_N) ? dimK : dimN;
-    int n = (transb == HPPL_OP_N) ? dimN : dimK;
-    hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
-    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
-    return;
-  }
-
-  cublasStatus_t stat;
-  if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_T,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta) {
-  int lda = (HPPL_OP_N == transa) ? dimK : dimM;
-  int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
-  int ldc = dimN;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                alpha,
-                beta,
-                lda,
-                ldb,
-                ldc);
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int incb,
-                          int incc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  cublasStatus_t stat;
-  if (HPPL_OP_N == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else if (HPPL_OP_T == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul_vector");
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta) {
-  hl_matrix_mul_vector(
-      A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
deleted file mode 100644
index b0ac5aaac..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
+++ /dev/null
@@ -1,1117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cudnn.h"
-#include <cudnn.h>
-#include <gflags/gflags.h>
-#include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-             4096,
-             "Specify cuDNN max workspace limit, in units MB, "
-             "4096MB=4GB by default.");
-
-namespace dynload {
-
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cudbnn routine
- * via operator overloading: operator ()
- *
- * note: default dynamic linked libs
- **/
-
-#ifdef PADDLE_USE_DSO
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> decltype(__name(args...)) {            \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);            \
-      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                  \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);             \
-    }                                                                       \
-  } __name; /* struct DynLoad__##__name */
-
-#else
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-
-#endif
-
-/**
- * include all needed cudnn functions in HPPL
- * different cudnn version has different interfaces
- **/
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetFilter4dDescriptor)                     \
-  __macro(cudnnSetPooling2dDescriptor)                    \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnSetConvolution2dDescriptor)                \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnSoftmaxBackward)                           \
-  __macro(cudnnSoftmaxForward)                            \
-  __macro(cudnnGetVersion)                                \
-  __macro(cudnnGetErrorString)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
-
-// APIs available after R3:
-#if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
-#endif
-
-
-// APIs available after R4:
-#if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
-  __macro(cudnnBatchNormalizationForwardInference)           \
-  __macro(cudnnBatchNormalizationBackward)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
-#endif
-
-// APIs in R5
-#if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
-#endif
-
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
-} /* namespace dynload */
-
-/**
- * Check build-in cudnn function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDNN(cudnnFunc)                                         \
-  do {                                                                 \
-    cudnnStatus_t cudnnStat = cudnnFunc;                               \
-    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                          \
-        << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
-  } while (0)
-
-bool g_is_libcudnn_init = false;
-int g_cudnn_lib_version = 0;
-
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
-}
-
-void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-  size_t cudnn_dso_major = cudnn_dso_ver / 1000;
-  size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
-  // Compare cudnn header version with that of cudnn.so.
-  CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
-        (cudnn_cuh_major == cudnn_dso_major))
-      << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
-      << cudnn_cuh_major << " unmatched!\n"
-      << "PaddlePaddle Requirement: "
-      << "(header v[2-3] with libcudnn v[2-3]) Or "
-      << "(header v4 with libcudnn v4) Or "
-      << "(header v5 with libcudnn v5) Or"
-      << "(header v6 with libcudnn v6).";
-
-  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
-      << "cudnn v5 requires cuda version >= 7.5";
-
-  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
-      << "cudnn v6 requires cuda version >= 8.0";
-
-  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
-  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
-  g_is_libcudnn_init = true;
-  g_cudnn_lib_version = cudnn_dso_ver;
-}
-
-int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
-
-void hl_conv_workspace(hl_tensor_descriptor input,
-                       hl_tensor_descriptor output,
-                       hl_filter_descriptor filter,
-                       hl_convolution_descriptor conv,
-                       int* convFwdAlgo,
-                       size_t* fwdLimitBytes,
-                       int* convBwdDataAlgo,
-                       size_t* bwdDataLimitBytes,
-                       int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes,
-                       bool useDilation) {
-#if CUDNN_VERSION >= 4000
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-
-  // Specify workspace limit directly
-  size_t memoryLimitBytes =
-      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
-  // For dilation
-  int algo = 0;
-
-  // cudnn convolution forward configuration
-  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  if (useDilation) {
-    convFwdAlgo = &algo;
-    convBwdDataAlgo = &algo;
-    convBwdFilterAlgo = &algo;
-  } else {
-    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-        t_resource.cudnn_handle,
-        fwd_src_desc,
-        fwd_filter_desc,
-        fwd_conv_desc,
-        fwd_dest_desc,
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_data_filter_desc,
-        bwd_data_diff_desc,
-        bwd_data_conv_desc,
-        bwd_data_grad_desc,
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_filter_src_desc,
-        bwd_filter_diff_desc,
-        bwd_filter_conv_desc,
-        bwd_filter_grad_desc,
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-  }
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
-      fwdLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-      bwdDataLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-      bwdFilterLimitBytes));
-
-#endif
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
-                                 int batch_size,
-                                 int feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  batch_size,
-                                                  feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_desc->format = CUDNN_TENSOR_NCHW;
-  hl_desc->data_type = data_type;
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  hl_desc->data_type = data_type;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width) {
-  const int stride_w = 1;
-  const int stride_h = width * stride_w;
-  const int stride_c = height * stride_h;
-  const int stride_n = feature_maps * stride_c;
-  return hl_tensor_reshape(image_desc,
-                           batch_size,
-                           feature_maps,
-                           height,
-                           width,
-                           stride_n,
-                           stride_c,
-                           stride_h,
-                           stride_w);
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width,
-                       int nStride,
-                       int cStride,
-                       int hStride,
-                       int wStride) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
-                                                    hl_desc->data_type,
-                                                    batch_size,
-                                                    feature_maps,
-                                                    height,
-                                                    width,
-                                                    nStride,
-                                                    cStride,
-                                                    hStride,
-                                                    wStride));
-
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-}
-
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
-
-  hl_desc->desc = NULL;
-
-  free(image_desc);
-}
-
-void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                  hl_pooling_mode_t mode,
-                                  int height,
-                                  int width,
-                                  int height_padding,
-                                  int width_padding,
-                                  int stride_height,
-                                  int stride_width) {
-  cudnnPoolingMode_t cudnn_mode;
-  switch (mode) {
-    case HL_POOLING_MAX:
-      cudnn_mode = CUDNN_POOLING_MAX;
-      break;
-    case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    default:
-      LOG(FATAL) << "parameter mode error";
-  }
-
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling_desc =
-      (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
-  CHECK_NOTNULL(hl_pooling_desc);
-
-  CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
-                                                   cudnn_mode,
-#if CUDNN_VERSION >= 5000
-                                                   CUDNN_PROPAGATE_NAN,
-#endif
-                                                   height,
-                                                   width,
-                                                   height_padding,
-                                                   width_padding,
-                                                   stride_height,
-                                                   stride_width));
-
-  hl_pooling_desc->mode = cudnn_mode;
-  hl_pooling_desc->window_height = height;
-  hl_pooling_desc->window_width = width;
-  hl_pooling_desc->stride_height = stride_height;
-  hl_pooling_desc->stride_width = stride_width;
-
-  *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
-}
-
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-
-  CHECK_NOTNULL(hl_pooling->desc);
-  CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
-
-  hl_pooling->desc = NULL;
-
-  free(pooling_desc);
-}
-
-void hl_pooling_forward(hl_tensor_descriptor input,
-                        real* input_image,
-                        hl_tensor_descriptor output,
-                        real* output_image,
-                        hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(output_image);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
-                                           pooling_desc,
-                                           &alpha,
-                                           input_desc,
-                                           input_image,
-                                           &beta,
-                                           output_desc,
-                                           output_image));
-  CHECK_SYNC("hl_pooling_forward failed");
-}
-
-void hl_pooling_backward(hl_tensor_descriptor input,
-                         real* input_image,
-                         real* input_image_grad,
-                         hl_tensor_descriptor output,
-                         real* output_image,
-                         real* output_image_grad,
-                         hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(input_image_grad);
-  CHECK_NOTNULL(output_image);
-  CHECK_NOTNULL(output_image_grad);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
-                                            pooling_desc,
-                                            &alpha,
-                                            output_desc,
-                                            output_image,
-                                            output_desc,
-                                            output_image_grad,
-                                            input_desc,
-                                            input_image,
-                                            &beta,
-                                            input_desc,
-                                            input_image_grad));
-  CHECK_SYNC("hl_pooling_backward failed");
-}
-
-void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                 int input_feature_maps,
-                                 int output_feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter =
-      (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
-  CHECK_NOTNULL(hl_filter);
-
-  CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
-                                                  data_type,
-#if CUDNN_VERSION >= 5000
-                                                  CUDNN_TENSOR_NCHW,
-#endif
-                                                  output_feature_maps,
-                                                  input_feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_filter->data_type = data_type;
-  hl_filter->output_feature_maps = output_feature_maps;
-  hl_filter->input_feature_maps = input_feature_maps;
-  hl_filter->filter_height = height;
-  hl_filter->filter_width = width;
-
-  *filter = (hl_filter_descriptor)hl_filter;
-}
-
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
-  CHECK_NOTNULL(hl_filter->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
-
-  hl_filter->desc = NULL;
-
-  free(filter);
-}
-
-void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                      hl_tensor_descriptor image,
-                                      hl_filter_descriptor filter,
-                                      int padding_height,
-                                      int padding_width,
-                                      int stride_height,
-                                      int stride_width,
-                                      int dilation_h,
-                                      int dilation_w) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
-      sizeof(_cudnn_convolution_descriptor));
-
-  CHECK_NOTNULL(hl_conv);
-  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  if (dilation_h > 1 || dilation_w > 1) {
-    LOG(FATAL)
-        << "Current cuDNN version does't support for dilation convolution. "
-        << "The dilation convolution requires cuDNN >= v6.0.";
-  }
-
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-
-  *conv = (hl_convolution_descriptor)hl_conv;
-}
-
-void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                     hl_tensor_descriptor image,
-                                     hl_filter_descriptor filter,
-                                     int padding_height,
-                                     int padding_width,
-                                     int stride_height,
-                                     int stride_width,
-                                     int dilation_h,
-                                     int dilation_w) {
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(image);
-  CHECK_NOTNULL(filter);
-
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-}
-
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  CHECK_NOTNULL(hl_conv->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
-  hl_conv->desc = NULL;
-
-  free(conv);
-}
-
-void hl_convolution_forward(hl_tensor_descriptor input,
-                            real* input_data,
-                            hl_tensor_descriptor output,
-                            real* output_data,
-                            hl_filter_descriptor filter,
-                            real* filter_data,
-                            hl_convolution_descriptor conv,
-                            void* gpuWorkSpace,
-                            size_t sizeInBytes,
-                            int convFwdAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_data);
-  CHECK_NOTNULL(filter_data);
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  CHECK_CUDNN(dynload::cudnnConvolutionForward(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      filter_desc,
-      filter_data,
-      conv_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-      &beta,
-      dest_desc,
-      output_data));
-  CHECK_SYNC("hl_convolution_forward failed");
-}
-
-void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                     real* bias_data,
-                                     hl_tensor_descriptor output,
-                                     real* output_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_data);
-  CHECK_NOTNULL(output_data);
-
-  cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-
-  CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
-#if CUDNN_VERSION < 4000
-                                      CUDNN_ADD_SAME_C,
-#endif
-                                      &alpha,
-                                      bias_desc,
-                                      bias_data,
-                                      &beta,
-                                      output_desc,
-                                      output_data));
-  CHECK_SYNC("hl_convolution_forward_add_bias failed");
-}
-
-void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                  real* bias_grad_data,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_grad_data);
-  CHECK_NOTNULL(output_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
-                                                    &alpha,
-                                                    diff_desc,
-                                                    output_grad_data,
-                                                    &beta,
-                                                    bias_desc,
-                                                    bias_grad_data));
-  CHECK_SYNC("hl_convolution_backward_bias failed");
-}
-
-void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                    real* input_data,
-                                    hl_tensor_descriptor output,
-                                    real* output_grad_data,
-                                    hl_filter_descriptor filter,
-                                    real* filter_grad_data,
-                                    hl_convolution_descriptor conv,
-                                    void* gpuWorkSpace,
-                                    size_t sizeInBytes,
-                                    int convBwdFilterAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_grad_data);
-  CHECK_NOTNULL(filter_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      filter_grad_data));
-  CHECK_SYNC("hl_convolution_backward_filter failed");
-}
-
-void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                  real* input_data_grad,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data,
-                                  hl_filter_descriptor filter,
-                                  real* filter_data,
-                                  hl_convolution_descriptor conv,
-                                  void* gpuWorkSpace,
-                                  size_t sizeInBytes,
-                                  int convBwdDataAlgo) {
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
-      t_resource.cudnn_handle,
-      &alpha,
-      filter_desc,
-      filter_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      input_data_grad));
-  CHECK_SYNC("hl_convolution_backward_data failed");
-}
-
-void hl_softmax_forward(real* input, real* output, int height, int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
-                                           CUDNN_SOFTMAX_ACCURATE,
-                                           CUDNN_SOFTMAX_MODE_CHANNEL,
-                                           &alpha,
-                                           t_resource.cudnn_desc,
-                                           input,
-                                           &beta,
-                                           t_resource.cudnn_desc,
-                                           output));
-  CHECK_SYNC("hl_softmax_forward failed");
-}
-
-void hl_softmax_backward(real* output_value,
-                         real* output_grad,
-                         int height,
-                         int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
-                                            CUDNN_SOFTMAX_ACCURATE,
-                                            CUDNN_SOFTMAX_MODE_CHANNEL,
-                                            &alpha,
-                                            t_resource.cudnn_desc,
-                                            output_value,
-                                            t_resource.cudnn_desc,
-                                            output_grad,
-                                            &beta,
-                                            t_resource.cudnn_desc,
-                                            output_grad));
-  CHECK_SYNC("hl_softmax_backward failed");
-}
-
-void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                    real* input,
-                                    hl_tensor_descriptor outputDesc,
-                                    real* output,
-                                    hl_tensor_descriptor bnParamDesc,
-                                    real* scale,
-                                    real* bias,
-                                    double factor,
-                                    real* runningMean,
-                                    real* runningInvVar,
-                                    double epsilon,
-                                    real* savedMean,
-                                    real* savedVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != runningMean && NULL == runningInvVar) ||
-      (NULL == runningMean && NULL != runningInvVar)) {
-    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
-               << "but only at the same time.";
-  }
-  if ((NULL != savedMean && NULL == savedVar) ||
-      (NULL == savedMean && NULL != savedVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
-                                                      mode,
-                                                      &alpha,
-                                                      &beta,
-                                                      xDesc,
-                                                      input,
-                                                      yDesc,
-                                                      output,
-                                                      bnDesc,
-                                                      scale,
-                                                      bias,
-                                                      factor,
-                                                      runningMean,
-                                                      runningInvVar,
-                                                      epsilon,
-                                                      savedMean,
-                                                      savedVar));
-
-  CHECK_SYNC("hl_batch_norm_forward_training failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                     real* input,
-                                     hl_tensor_descriptor outputDesc,
-                                     real* output,
-                                     hl_tensor_descriptor bnParamDesc,
-                                     real* scale,
-                                     real* bias,
-                                     real* estimatedMean,
-                                     real* estimatedInvVar,
-                                     double epsilon) {
-#if CUDNN_VERSION >= 4007
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       yDesc,
-                                                       output,
-                                                       bnDesc,
-                                                       scale,
-                                                       bias,
-                                                       estimatedMean,
-                                                       estimatedInvVar,
-                                                       epsilon));
-
-  CHECK_SYNC("hl_batch_norm_forward_inference failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                            real* input,
-                            hl_tensor_descriptor outGradDesc,
-                            real* outGrad,
-                            hl_tensor_descriptor inGradDesc,
-                            real* inGrad,
-                            hl_tensor_descriptor dBnParamDesc,
-                            real* scale,
-                            real* scaleGrad,
-                            real* biasGrad,
-                            double epsilon,
-                            real* savedMean,
-                            real* savedInvVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != savedMean && NULL == savedInvVar) ||
-      (NULL == savedMean && NULL != savedInvVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc);
-  cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       dyDesc,
-                                                       outGrad,
-                                                       dxDesc,
-                                                       inGrad,
-                                                       bnDesc,
-                                                       scale,
-                                                       scaleGrad,
-                                                       biasGrad,
-                                                       epsilon,
-                                                       savedMean,
-                                                       savedInvVar));
-
-  CHECK_SYNC("hl_batch_norm_backward failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc
deleted file mode 100644
index 92197afb3..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_device.cc
+++ /dev/null
@@ -1,681 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// clang-format off
-// Because clang-format 4.X and clang-format 3.8+ format
-// following lines in different. So disable clang-format.
-#include "hl_cuda.h"
-#include <cuda_profiler_api.h>
-#include <string.h>
-#include <sys/syscall.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include "hl_cuda.ph"
-#include "hl_thread.ph"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/DynamicLoader.h"
-// clang-format on
-
-namespace dynload {
-
-std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load curand routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    curandStatus_t operator()(Args... args) {                                  \
-      typedef curandStatus_t (*curandFunc)(Args...);                           \
-      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    curandStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed curand functions in HPPL */
-// clang-format off
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)
-// clang-format on
-
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
-
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
-
-} /* namespace dynload */
-
-/**
- * @brief   global resource.
- */
-int g_system_device_num = 0;                /* system device number */
-int device_num = 0;                         /* use    device number */
-hl_device_prop *g_device;                   /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
-int g_cuda_lib_version = 0;
-
-/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
-/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
-/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256 * 4)
-
-/**
- * Check build-in cuda function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << cudaGetErrorString(cudaStat); \
-  } while (0)
-
-/**
- * @brief   thread resource.
- */
-__thread _hl_thread_resource t_resource = {{0},    /* stream */
-                                           0,      /* handle */
-                                           0,      /* gen */
-                                           0,      /* cudnn_handle */
-                                           0,      /* cudnn_desc */
-                                           NULL,   /* gen_mutex */
-                                           NULL,   /* gpu_mem */
-                                           NULL,   /* cpu_mem */
-                                           0,      /* event */
-                                           -1,     /* device */
-                                           0,      /* major */
-                                           false}; /* is_init */
-
-__thread cudaStream_t default_stream = 0;
-__thread bool g_sync_flag = true;
-bool hl_start_flag = false;
-
-inline pid_t gettid() {
-#if defined(__APPLE__) || defined(__OSX__)
-  // syscall is deprecated: first deprecated in macOS 10.12.
-  // syscall is unsupported;
-  // syscall pid_t tid = syscall(SYS_thread_selfid);
-  uint64_t tid;
-  pthread_threadid_np(NULL, &tid);
-#else
-#ifndef _WIN32
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-#else   // _WIN32
-  pid_t tid = _getpid();
-#endif  // _WIN32
-#endif
-  CHECK_NE((int)tid, -1);
-  return tid;
-}
-
-void hl_init(int device) {
-  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
-
-  /* thread has been initialized */
-  if (true == t_resource.is_init) {
-    hl_set_device(device);
-    return;
-  }
-
-  /* create thread devcie resources */
-  char *tmp;
-  thread_device_resources device_res;
-  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
-                       device_num * sizeof(_thread_device_resources));
-  CHECK_NOTNULL(tmp);
-  t_device = (thread_device_resources *)tmp;
-  device_res = (thread_device_resources)(
-      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
-  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  int num = 0;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!g_device[dev]) {
-      continue;
-    }
-
-    t_device[dev] = &device_res[num];
-    t_device[dev]->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
-
-    hl_create_thread_resources(dev, t_device[dev]);
-    num++;
-  }
-
-  hl_cudnn_desc_init(&t_resource.cudnn_desc);
-
-  /* thread initialization is complete */
-  t_resource.is_init = true;
-  /* set device */
-  t_resource.device = -1;
-  hl_set_device(device);
-}
-
-void hl_fini() {
-  if (false == t_resource.is_init) {
-    return;
-  }
-
-  /* hppl stream fini */
-  t_resource.device = -1;
-  for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-    t_resource.stream[i] = 0;
-  }
-
-  char *tmp = (char *)t_device;
-  char *tmp_stream = NULL;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!t_device[dev]) {
-      continue;
-    }
-    if (!tmp_stream) {
-      tmp_stream = (char *)t_device[dev]->stream;
-    }
-    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
-    }
-
-    /* free device memory */
-    hl_free_mem_device(t_device[dev]->gpu_mem);
-    hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
-  }
-
-  free(tmp);
-  free(tmp_stream);
-  t_resource.is_init = false;
-}
-
-int hl_get_device_count() { return device_num; }
-
-void hl_set_device(int device) {
-  if (device == t_resource.device) {
-    return;
-  }
-
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device: " << device << " is not specified in startup.";
-
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* switch thread stream */
-  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
-    t_resource.stream[i] = g_device[device]->device_resources->stream[i];
-  }
-
-  if (true == t_resource.is_init) {
-    for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-      t_resource.stream[i] =
-          t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
-    }
-    t_resource.gpu_mem = t_device[device]->gpu_mem;
-    t_resource.cpu_mem = t_device[device]->cpu_mem;
-    t_resource.event = t_device[device]->mem_event;
-  }
-
-  t_resource.handle = g_device[device]->device_resources->handle;
-  t_resource.gen = g_device[device]->device_resources->gen;
-  t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle;
-  t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex;
-  t_resource.device = device;
-  t_resource.major = g_device[device]->major;
-  default_stream = t_resource.stream[0];
-}
-
-int hl_get_device() {
-  int device;
-  CHECK_CUDA(cudaGetDevice(&device));
-  return device;
-}
-
-void *hl_malloc_device(size_t size) {
-  void *dest_d;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
-
-  return dest_d;
-}
-
-void hl_free_mem_device(void *dest_d) {
-  CHECK_NOTNULL(dest_d);
-
-  cudaError_t err = cudaFree(dest_d);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void *hl_malloc_host(size_t size) {
-  void *dest_h;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
-
-  return dest_h;
-}
-
-void hl_free_mem_host(void *dest_h) {
-  CHECK_NOTNULL(dest_h);
-
-  cudaError_t err = cudaFreeHost(dest_h);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void hl_memcpy(void *dst, void *src, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
-}
-
-void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(cudaMemset(dest_d, value, size));
-}
-
-void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(src_h);
-  CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
-}
-
-void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_h);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
-}
-
-void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_d);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
-}
-
-void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_LT(stream, HPPL_STREAM_END);
-  cu_stream = t_resource.stream[stream];
-
-  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
-}
-
-void hl_start() {
-  hl_specify_devices_start(NULL, 0);
-  /* set default device */
-  hl_set_device(0);
-}
-
-bool hl_device_can_access_peer(int device, int peerDevice) {
-  int canAccessPeer;
-  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
-
-  if (canAccessPeer == 1) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
-  if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    cudaGetLastError();
-  } else {
-    CHECK_CUDA(err);
-  }
-}
-
-void hl_create_global_resources(hl_device_prop device_prop) {
-  struct cudaDeviceProp cu_prop;
-  int device = device_prop->device;
-  global_device_resources device_res = device_prop->device_resources;
-
-  CHECK_CUDA(cudaSetDevice(device));
-  /* device properties */
-  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
-
-  device_prop->major = cu_prop.major;
-  device_prop->minor = cu_prop.minor;
-  strncpy(device_prop->device_name, cu_prop.name, 256);
-  device_prop->device_mem = cu_prop.totalGlobalMem;
-
-  /* create device stream */
-  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* cublas init */
-  hl_cublas_init(&device_res->handle, device_res->stream[0]);
-
-  /* create curand gen */
-  CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
-                                          CURAND_RNG_PSEUDO_DEFAULT),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand init failed.";
-
-  CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand set stream failed!";
-
-  /* create cudnn handle */
-  hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
-
-  int seed = gettid();
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
-                                                       seed + device),
-           CURAND_STATUS_SUCCESS);
-
-  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
-  pthread_mutex_init(device_res->gen_mutex, NULL);
-
-  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
-}
-
-int hl_get_cuda_version() { return g_cuda_lib_version; }
-
-void hl_create_thread_resources(int device,
-                                thread_device_resources device_res) {
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* create thread stream */
-  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* allocation device memory */
-  device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
-
-  /* allocation host memory */
-  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
-
-  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
-}
-
-void hl_specify_devices_start(int *device, int number) {
-  if (hl_start_flag) return;
-
-  /* 1. get the number of devices */
-  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
-  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
-  if (device == NULL) {
-    number = g_system_device_num;
-  }
-
-  /* 2. check device & create device property table */
-  CHECK_LE(number, g_system_device_num)
-      << "[Start failed] System does not have enough device. "
-      << "Device number: " << g_system_device_num << "Input number: " << number;
-
-  char *tmp;
-  hl_device_prop device_prop;
-  tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
-                       number * sizeof(_hl_device_prop));
-  CHECK(tmp) << "[Start failed] System memory is not enough.";
-
-  g_device = (hl_device_prop *)tmp;
-  device_prop = (hl_device_prop)(
-      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
-  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
-  int num = 0;
-  for (int i = 0; i < number; i++) {
-    int dev;
-    if (device == NULL) {
-      dev = i;
-    } else {
-      dev = device[i];
-    }
-
-    CHECK_LT(dev, g_system_device_num)
-        << "[Start failed] The specified device number is "
-        << "out of range. Max device number: " << g_system_device_num - 1
-        << " Specified devcie number: " << dev;
-
-    if (g_device[dev]) {
-      /* Warning */
-      LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
-      continue;
-    }
-
-    g_device[dev] = &device_prop[num];
-    g_device[dev]->device = dev;
-    num++;
-  }
-  device_num = num;
-
-  /* 3.  create global device resources */
-  char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
-  CHECK_NOTNULL(tmp_res);
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  num = 0;
-  for (int i = 0; i < g_system_device_num; i++) {
-    if (!g_device[i]) {
-      continue;
-    }
-
-    g_device[i]->device_resources = (global_device_resources)(
-        tmp_res + num * sizeof(_global_device_resources));
-    g_device[i]->device_resources->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
-
-    hl_create_global_resources(g_device[i]);
-    num++;
-  }
-
-  /* hl_start() is ok */
-  hl_start_flag = true;
-  /* set default device */
-  if (device == NULL) {
-    hl_set_device(0);
-  } else {
-    hl_set_device(device[0]);
-  }
-}
-
-void hl_rand(real *dest_d, size_t num) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(
-#ifndef PADDLE_TYPE_DOUBLE
-      dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
-#else
-      dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
-#endif
-      CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-  CHECK_SYNC("hl_rand failed");
-}
-
-void hl_srand(unsigned int seed) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
-           CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-}
-
-void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-
-bool hl_get_sync_flag() { return g_sync_flag; }
-
-void hl_stream_synchronize(hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
-}
-
-void hl_create_event(hl_event_t *event) {
-  CHECK_NOTNULL(event);
-
-  struct _hl_event_st *st_event =
-      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
-
-  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
-
-  *event = st_event;
-}
-
-float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
-  float time;
-  CHECK_NOTNULL(start);
-  CHECK_NOTNULL(end);
-
-  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
-  return time;
-}
-
-void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
-}
-
-void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
-}
-
-void hl_destroy_event(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventDestroy(event->cu_event));
-
-  free(event);
-  event = NULL;
-}
-
-void hl_event_synchronize(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
-}
-
-void hl_get_device_name(char *name, int len, int device) {
-  CHECK_NOTNULL(name);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  strncpy(name, g_device[device]->device_name, len);
-}
-
-void hl_get_device_memory(size_t *mem_size, int device) {
-  CHECK_NOTNULL(mem_size);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *mem_size = g_device[device]->device_mem;
-}
-
-void hl_get_device_compute_capability(int *major, int *minor, int device) {
-  CHECK_NOTNULL(major);
-  CHECK_NOTNULL(minor);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *major = g_device[device]->major;
-  *minor = g_device[device]->minor;
-}
-
-int hl_get_device_last_error() { return (int)cudaGetLastError(); }
-
-const char *hl_get_device_error_string() {
-  cudaError_t err = cudaGetLastError();
-  return cudaGetErrorString(err);
-}
-
-const char *hl_get_device_error_string(size_t err) {
-  return cudaGetErrorString((cudaError_t)err);
-}
-
-void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
-void hl_set_device_flags_block() {
-  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-}
-
-bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = cudaEventQuery(event->cu_event);
-  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
-
-  if (cudaErrorNotReady == err) {
-    return false;
-  }
-  return true;
-}
-
-void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-
-void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/legacy/cuda/src/hl_cuda_lstm.cu b/paddle/legacy/cuda/src/hl_cuda_lstm.cu
deleted file mode 100644
index 9ac564fd2..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_lstm.cu
+++ /dev/null
@@ -1,876 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_activation_functions.h"
-#include "hl_base.h"
-#include "hl_cuda_cublas.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-typedef hppl::Active<real>::forward t_forward;
-typedef hppl::Active<real>::backward t_backward;
-
-bool hl_lstm_sequence_parallel(int frameSize) {
-  if (frameSize == 32 || frameSize == 64) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-class frameValue {
- public:
-  real *value_;
-  __device__ frameValue(real *value) : value_(value) {}
-  template <int reversed, int frameSize>
-  __device__ inline void init(int start, int length, int idx) {
-    if (reversed == 0) {
-      value_ += start * frameSize + idx;
-    } else {
-      value_ += (start + length - 1) * frameSize + idx;
-    }
-  }
-  __device__ inline real *getPtr() const { return value_; }
-  __device__ inline real getValue() { return *value_; }
-  __device__ inline void setValue(real value) { *value_ = value; }
-  template <int reversed, int frameSize>
-  __device__ inline void nextFrame() {
-    if (reversed == 0) {
-      value_ += frameSize;
-    } else {
-      value_ -= frameSize;
-    }
-  }
-};
-
-__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
-  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
-  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-template <int valueSize, int frameSize>
-__device__ __forceinline__ real forward_sequence(real value,
-                                                 real *shValue,
-                                                 real *state,
-                                                 real *preOutput,
-                                                 real *output,
-                                                 real check,
-                                                 int index,
-                                                 t_forward activeNode,
-                                                 t_forward activeGate,
-                                                 t_forward activeState) {
-  real out;
-  real prevOut;
-  real state_r;
-  const int idx = index % frameSize;
-  const int idy = index / frameSize;
-  // assert(index < valueSize);
-
-  if (idy == 0) {
-    value = activeNode(value);
-    shValue[index] = value;
-  }
-  if (idy == 1 || idy == 2) {
-    state_r = state[idx];
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-  }
-  ptx_sync(1, valueSize);
-  if (idy == 3) {
-    state_r = state[idx];
-    state_r = state_r * shValue[idx + frameSize * 2];
-    state_r += shValue[idx] * shValue[idx + frameSize];
-    state[idx] = state_r;
-    ptx_arrive(2, frameSize * 2);
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-    ptx_sync(3, frameSize * 2);
-    prevOut = preOutput[idx];
-    out = prevOut * value;
-    output[idx] = out;
-  }
-  if (idy == 0) {
-    ptx_sync(2, frameSize * 2);
-    prevOut = state[idx];
-    prevOut = activeState(prevOut);
-    preOutput[idx] = prevOut;
-    ptx_arrive(3, frameSize * 2);
-  }
-  return value;
-}
-
-#define OUTPUT_BARRIER_ID 10
-#define OUTPUT_BARRIER_ID2 11
-template <int valueSize,
-          int frameSize,
-          int reversed,
-          int computeThreads,
-          int blockSize>
-__global__ void KeLstmForward(real *gateValue,
-                              real *state,
-                              real *output,
-                              real *preOutput,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *starts,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  __shared__ real shValue[valueSize];
-  __shared__ real shState[frameSize];
-  __shared__ real shPrevOutput[frameSize];
-  __shared__ real shOutput[frameSize];
-
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  /* init */
-  real check;
-  real value;
-  frameValue frameGate(gateValue);
-  frameValue frameState(state);
-  frameValue frameOutput(output);
-  frameValue framePreOutput(preOutput);
-  if (index < valueSize) {
-    const int idx = index % frameSize;
-    const int idy = index / frameSize;
-    frameGate.init<reversed, valueSize>(start, length, index);
-    value = frameGate.getValue();
-    if (idy == 0) {
-      shState[idx] = 0.0;
-    } else if (idy == 1) {
-      check = checkIg[idx];
-    } else if (idy == 2) {
-      check = checkFg[idx];
-    } else if (idy == 3) {
-      check = checkOg[idx];
-    }
-
-    if (idy == 3) {
-      frameState.init<reversed, frameSize>(start, length, idx);
-      frameOutput.init<reversed, frameSize>(start, length, idx);
-      framePreOutput.init<reversed, frameSize>(start, length, idx);
-    }
-
-    ptx_sync(1, valueSize);
-  }
-
-  for (int i = 0; i < length; ++i) {
-    if (index < valueSize) {
-      if (valueSize == 128) {
-        if (i != 0) {
-          ptx_sync(OUTPUT_BARRIER_ID2, blockSize);
-          value += shValue[index];
-        }
-      }
-      value = forward_sequence<valueSize, frameSize>(
-          value,
-          shValue,
-          shState,
-          shPrevOutput,
-          shOutput,
-          check,
-          index,
-          hppl::gpu::forward[active_node],
-          hppl::gpu::forward[active_gate],
-          hppl::gpu::forward[active_state]);
-      const int idx = index % frameSize;
-      const int idy = index / frameSize;
-      if (valueSize == 128) {
-        if (idy == 3) {
-          ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128);
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(OUTPUT_BARRIER_ID, valueSize);
-      }
-      frameGate.setValue(value);
-      if (idy == 3) {
-        frameState.setValue(shState[idx]);
-        frameOutput.setValue(shOutput[idx]);
-        framePreOutput.setValue(shPrevOutput[idx]);
-        frameState.nextFrame<reversed, frameSize>();
-        frameOutput.nextFrame<reversed, frameSize>();
-        framePreOutput.nextFrame<reversed, frameSize>();
-      }
-      if (i != length - 1) {
-        frameGate.nextFrame<reversed, valueSize>();
-        value = frameGate.getValue();
-      }
-    }
-    if (i != length - 1) {
-      if (valueSize == 128) {
-        if (valueSize <= index) {
-          real B_r[frameSize];
-          const int computeIdx = index - valueSize;
-          if (i == 0) {
-#pragma unroll
-            for (int n = 0; n < frameSize; n++) {
-              B_r[n] = weight[n * valueSize + computeIdx];
-            }
-          }
-          ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128);
-          real A_r[frameSize];
-          for (int n = 0; n < frameSize; n++) {
-            A_r[n] = shOutput[n];
-          }
-          real sum = 0.0f;
-          for (int n = 0; n < frameSize; n++) {
-            sum += A_r[n] * B_r[n];
-          }
-          shValue[computeIdx] = sum;
-          ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
-        }
-      }
-      if (valueSize == 256) {
-        real B_r[frameSize];
-        if (i == 0) {
-#pragma unroll
-          for (int n = 0; n < frameSize; n++) {
-            B_r[n] = weight[n * valueSize + index];
-          }
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += shOutput[n] * B_r[n];
-        }
-        value += sum;
-      }
-    }
-  }
-}
-
-void hl_lstm_parallel_forward(real *gateValue,
-                              real *stateValue,
-                              real *preOutputValue,
-                              real *outputValue,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *sequence,
-                              int frameSize,
-                              int numSequences,
-                              bool reversed,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_forward failed");
-}
-
-__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  const int warp_size = 32;
-  int addr = idx % warp_size;
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, addr < warp_size);
-#pragma unroll
-  for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
-    a[k] = __shfl_sync(mask, a[k], addr, 32);
-  }
-
-#pragma unroll
-  for (int tid = 0; tid < 31; tid++) {
-    real tmp = (idx > tid) ? a[0] : a[1];
-#pragma unroll
-    for (int k = 31; k > 0; k--) {
-      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
-    }
-    a[1] = tmp;
-  }
-
-  addr = (32 - idx) % 32;
-  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
-#pragma unroll
-  for (int k = 0; k < 32; k++) {
-    a[k] = __shfl_sync(mask, a[k], addr, 32);
-    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void backward_sequence(real rGateValue,
-                                  real rOutputGrad,
-                                  real rPreOutputValue,
-                                  real &rGateGrad,
-                                  real &rStateGrad,
-                                  real *shStateGrad,
-                                  real *shStateValue,
-                                  real *shGateValue,
-                                  real rCheck,
-                                  real &rGateValuePrev,
-                                  int index,
-                                  t_backward activeNode,
-                                  t_backward activeGate,
-                                  t_backward activeState) {
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  if (frameIdy == 3) {
-    real rPrevOutputGrad;
-    rPrevOutputGrad = rOutputGrad * rGateValue;
-    rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue);
-    rGateGrad = rOutputGrad * rPreOutputValue;
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_arrive(3, valueSize);
-  } else if (frameIdy == 1) {
-    shGateValue[frameIdx + frameSize] = rGateValue;
-    rStateGrad = rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 2) {
-    rStateGrad = rStateGrad * rGateValuePrev;
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateValuePrev = rGateValue;
-    rGateGrad = rStateGrad * shStateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 0) {
-    shGateValue[frameIdx] = rGateValue;
-    ptx_sync(3, valueSize);
-    rStateGrad = shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
-    rGateGrad = activeNode(rGateGrad, rGateValue);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void load_weight(real rWeight[], real *weight, const int index) {
-  if (valueSize == 128) {
-    weight += index;
-#pragma unroll
-    for (int n = 0; n < frameSize; n++) {
-      rWeight[n] = weight[n * valueSize];
-    }
-    transpose_32x32(rWeight, index % 32);
-  }
-  if (valueSize == 256) {
-    int id = (index / 32) % 2;
-    weight += index - id * 32 + id * 32 * valueSize;
-#pragma unroll
-    for (int n = 0; n < 32; n++) {
-      rWeight[n] = weight[n * valueSize];
-      rWeight[n + 32] = weight[n * valueSize + 32];
-    }
-    transpose_32x32(rWeight, index % 32);
-    transpose_32x32(&rWeight[32], index % 32);
-  }
-}
-
-template <int valueSize, int frameSize, int reversed>
-__global__ void KeLstmBackward(real *gateValue,
-                               real *gateGrad,
-                               real *stateValue,
-                               real *stateGrad, /* do not need save */
-                               real *preOutputValue,
-                               real *preOutputGrad, /* do not need save */
-                               real *checkIg,
-                               real *checkIgGrad,
-                               real *checkFg,
-                               real *checkFgGrad,
-                               real *checkOg,
-                               real *checkOgGrad,
-                               real *outputGrad,
-                               real *weightValue,
-                               const int *starts,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  __shared__ real shGateValue[valueSize];
-  __shared__ real shStateGrad[valueSize];
-  __shared__ real shStateValue[frameSize];
-  __shared__ real shGateGrad[4][frameSize];
-  __shared__ real shOutputGrad[4][frameSize];
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  real rCheck;
-  real rCheckGrad;
-  real rGateGrad;
-  real rStateGrad;
-  real rGateValuePrev;
-  real rPreOutputValue;
-  real rOutputGrad;
-  real rGateValue;
-  real rStateValue;
-
-  frameValue frameGateValue(gateValue);
-  frameValue frameGateGrad(gateGrad);
-  frameValue framePreOutputValue(preOutputValue);
-  frameValue frameStateValue(stateValue);
-  frameValue frameOutputGrad(outputGrad);
-  if (frameIdy == 0) {
-  } else if (frameIdy == 1) {
-    rCheck = checkIg[frameIdx];
-  } else if (frameIdy == 2) {
-    rCheck = checkFg[frameIdx];
-    rGateValuePrev = 0.0;
-    rStateGrad = 0.0;
-  } else if (frameIdy == 3) {
-    rCheck = checkOg[frameIdx];
-    framePreOutputValue.init<!reversed, frameSize>(start, length, frameIdx);
-    frameOutputGrad.init<!reversed, frameSize>(start, length, frameIdx);
-    rOutputGrad = frameOutputGrad.getValue();
-    rPreOutputValue = framePreOutputValue.getValue();
-    frameStateValue.init<!reversed, frameSize>(start, length, frameIdx);
-    rStateValue = frameStateValue.getValue();
-  }
-
-  frameGateValue.init<!reversed, valueSize>(start, length, index);
-  frameGateGrad.init<!reversed, valueSize>(start, length, index);
-  rGateValue = frameGateValue.getValue();
-  rGateGrad = 0.0;
-  rCheckGrad = 0.0;
-
-  real B_r[frameSize];
-  load_weight<valueSize, frameSize>(B_r, weightValue, index);
-
-  for (int i = 0; i < length; ++i) {
-    if (frameIdy == 3) {
-      if (i != length - 1) {
-        frameStateValue.nextFrame<!reversed, frameSize>();
-        shStateValue[frameIdx] = frameStateValue.getValue();
-      } else {
-        shStateValue[frameIdx] = 0.0;
-      }
-    }
-    backward_sequence<valueSize, frameSize>(rGateValue,
-                                            rOutputGrad,
-                                            rPreOutputValue,
-                                            rGateGrad,
-                                            rStateGrad,
-                                            shStateGrad,
-                                            shStateValue,
-                                            shGateValue,
-                                            rCheck,
-                                            rGateValuePrev,
-                                            index,
-                                            hppl::gpu::backward[active_node],
-                                            hppl::gpu::backward[active_gate],
-                                            hppl::gpu::backward[active_state]);
-    if (frameIdy == 3) {
-      rCheckGrad += rGateGrad * rStateValue;
-      rStateValue = shStateValue[frameIdx];
-    }
-
-    frameGateGrad.setValue(rGateGrad);
-    frameGateGrad.nextFrame<!reversed, valueSize>();
-
-    if (i != length - 1) {
-      if (frameIdy == 3) {
-        framePreOutputValue.nextFrame<!reversed, frameSize>();
-        rPreOutputValue = framePreOutputValue.getValue();
-        frameOutputGrad.nextFrame<!reversed, frameSize>();
-        rOutputGrad = frameOutputGrad.getValue();
-      } else if (frameIdy == 2) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      } else if (frameIdy == 1) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      }
-
-      frameGateValue.nextFrame<!reversed, valueSize>();
-      rGateValue = frameGateValue.getValue();
-      shGateGrad[frameIdy][frameIdx] = rGateGrad;
-      if (valueSize == 128) {
-        real sum = 0.0f;
-#pragma unroll
-        for (int n = 0; n < frameSize; n++) {
-          sum += shGateGrad[frameIdy][n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(5, valueSize);
-        real A_r[frameSize];
-        for (int n = 0; n < frameSize; n++) {
-          A_r[n] = shGateGrad[frameIdy][n];
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += A_r[n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-
-      if (frameIdy == 3) {
-        ptx_sync(6, valueSize);
-#pragma unroll
-        for (int i = 0; i < 3; i++) {
-          rOutputGrad += shOutputGrad[i][frameIdx];
-        }
-      } else {
-        ptx_arrive(6, valueSize);
-      }
-    }
-  }
-
-  /* TODO: Temporary save & merger in another kernel */
-  if (frameIdy == 1) {
-    if (checkIgGrad)
-      paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 2) {
-    if (checkFgGrad)
-      paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 3) {
-    if (checkOgGrad)
-      paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
-  }
-}
-
-void hl_lstm_parallel_backward_data(real *gateValue,
-                                    real *gateGrad,
-                                    real *stateValue,
-                                    real *stateGrad,
-                                    real *preOutputValue,
-                                    real *preOutputGrad,
-                                    real *outputGrad,
-                                    real *checkIg,
-                                    real *checkIgGrad,
-                                    real *checkFg,
-                                    real *checkFgGrad,
-                                    real *checkOg,
-                                    real *checkOgGrad,
-                                    real *weight,
-                                    const int *sequence,
-                                    int frameSize,
-                                    int numSequences,
-                                    bool reversed,
-                                    hl_activation_mode_t active_node,
-                                    hl_activation_mode_t active_gate,
-                                    hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
-        frameSize == 256);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_data");
-}
-
-template <int B_X, int B_Y>
-__global__ void KeSetGradZero(real *gateGrad,
-                              const int *starts,
-                              int valueSize,
-                              int numSequences,
-                              bool reversed) {
-  // const int tid = threadIdx.x;
-
-  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
-  const int numSeqId = blockIdx.y * B_Y + threadIdx.y;
-
-  if (numSeqId >= numSequences || frameIdx >= valueSize) return;
-
-  if (!reversed) {
-    int seqId = starts[numSeqId];
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  } else {
-    int seqId = starts[numSeqId + 1] - 1;
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  }
-}
-
-void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                      real *outputValue,
-                                      real *gateGrad,
-                                      const int *sequence,
-                                      int frameSize,
-                                      int batchSize,
-                                      int numSequences,
-                                      bool reversed) {
-  int valueSize = 4 * frameSize;
-  dim3 threads(32, 32);
-  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
-  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      gateGrad, sequence, valueSize, numSequences, reversed);
-
-  if (!reversed) {
-    hl_matrix_mul(outputValue,
-                  HPPL_OP_T,
-                  gateGrad + valueSize,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  } else {
-    hl_matrix_mul(outputValue + frameSize,
-                  HPPL_OP_T,
-                  gateGrad,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_weight");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_matrix.cu b/paddle/legacy/cuda/src/hl_cuda_matrix.cu
deleted file mode 100644
index 6fe460026..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_matrix.cu
+++ /dev/null
@@ -1,806 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
-#include "hl_matrix.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sequence.h"
-#include "hl_sparse.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
-void hl_matrix_add(real* A_d,
-                   real* B_d,
-                   real* C_d,
-                   int dimM,
-                   int dimN,
-                   real alpha,
-                   real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
-      ternary::_add<real>(alpha, beta),
-      A_d,
-      B_d,
-      C_d,
-      dimM,
-      dimN,
-      dimN,
-      dimN,
-      dimN);
-  CHECK_SYNC("hl_matrix_add failed");
-}
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define THRESHOLD 128
-#else
-#define THRESHOLD 64
-#endif
-__device__ __forceinline__ void findMax(real* I,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN,
-                                        real* max) {
-  dfMax_s[base] = -1.0e20;
-  while (curIdx < dimN) {
-    if (dfMax_s[base] < I[nextIdx]) {
-      dfMax_s[base] = I[nextIdx];
-    }
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      if (dfMax_s[base] < dfMax_s[nextIdx]) {
-        dfMax_s[base] = dfMax_s[nextIdx];
-      }
-    }
-  }
-
-  if (0 == base) {
-    max[0] = dfMax_s[0];
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void subMaxAndExp(real* I,
-                                             real* O,
-                                             int curIdx,
-                                             int nextIdx,
-                                             int blockSize,
-                                             int dimN,
-                                             real max) {
-  real val;
-  while (curIdx < dimN) {
-    val = I[nextIdx] - max;
-    if (val < -THRESHOLD) {
-      val = -THRESHOLD;
-    }
-    I[nextIdx] = val;
-#ifndef PADDLE_TYPE_DOUBLE
-    O[nextIdx] = __expf(val);
-#else
-    O[nextIdx] = exp(val);
-#endif
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void valueSum(real* O,
-                                         real* dfMax_s,
-                                         int blockSize,
-                                         int base,
-                                         int curIdx,
-                                         int nextIdx,
-                                         int dimN) {
-  dfMax_s[base] = 0;
-  while (curIdx < dimN) {
-    dfMax_s[base] += O[nextIdx];
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      dfMax_s[base] += dfMax_s[nextIdx];
-    }
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void divSum(
-    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
-  while (curIdx < dimN) {
-    O[nextIdx] /= sum;
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-}
-
-__device__ __forceinline__ void softmax(real* I,
-                                        real* O,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN) {
-  __shared__ real max;
-
-  // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
-
-  // sub max Value and do Exp operation
-  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
-
-  // add dimN values into blockDim.x buffer
-  // sum is in dfMax_s[0]
-  valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-
-  // divided by sum
-  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
-}
-
-template <int blockSize>
-__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
-  int base = threadIdx.x;
-  __shared__ real dfMax_s[blockSize];
-  int nextIdx = blockIdx.x * dimN + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
-  CHECK_SYNC("hl_matrix_softmax failed");
-}
-
-template <int blockSize>
-__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
-  int base = threadIdx.x;
-  int bid = blockIdx.x;
-  __shared__ real dfMax_s[blockSize];
-
-  int start = index[bid];
-  int dimN = index[bid + 1] - start;
-
-  int nextIdx = start + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_sequence_softmax_forward(real* A_d,
-                                 real* C_d,
-                                 const int* index,
-                                 int numSequence) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
-  CHECK_SYNC("hl_sequence_softmax_forward failed");
-}
-
-__global__ void KeMatrixDerivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
-  }
-}
-
-void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(sftmaxSum_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, sftmaxSum_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_softmax_derivative failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropy(
-    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      entropy[index] -= log(1 - output[index * dimN + i]);
-    }
-    int* row_col = col + row[index];
-    int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i++) {
-      real o = output[index * dimN + row_col[i]];
-      entropy[index] -= log(o / (1 - o));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy(real* output,
-                                          real* entropy,
-                                          hl_sparse_matrix_s csr_mat,
-                                          int dimM,
-                                          int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(entropy);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(
-    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
-  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      int index = row_idx * dimN + i;
-      grad[index] += 1.0 / (1 - output[index]);
-    }
-    int col_num = row[row_idx + 1] - row[row_idx];
-    int* row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i++) {
-      int index = row_idx * dimN + row_col[i];
-      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
-}
-
-__global__ void KeMatrixCrossEntropy(
-    real* O, real* E, int* label, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int newBase;
-  if (index < dimM) {
-    newBase = label[index];
-    newBase = newBase % dimN;
-    E[index] = -log(O[index * dimN + newBase]);
-  }
-}
-
-void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  int blocks = (dimM + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, C_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy failed");
-}
-
-__global__ void KeMatrixCrossEntropyBp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    if (label_d[rowIdx] == colIdx) {
-      grad_d[index] -= 1.0f / output_d[index];
-    }
-  }
-}
-
-void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(label_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
-}
-
-void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
-}
-
-__global__ void KeParamReluForward(real* output,
-                                   real* input,
-                                   real* w,
-                                   int width,
-                                   int height,
-                                   int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    output[index] =
-        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
-  }
-}
-
-void hl_param_relu_forward(real* output,
-                           real* input,
-                           real* w,
-                           int width,
-                           int height,
-                           int partial_sum) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(w);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, input, w, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_forward failed");
-}
-
-template <int blockSize>
-__global__ void KeParamReluBackWardW(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum) {
-  const int tid = threadIdx.x;
-  __shared__ real temp[blockSize];
-  grad_o += partial_sum * blockIdx.x;
-  input += partial_sum * blockIdx.x;
-  real tmp = 0.0;
-  for (int index = tid; index < partial_sum * height; index += blockSize) {
-    int row = index / partial_sum;
-    int offset = row * width + (index - row * partial_sum);
-    if (input[offset] < 0) {
-      tmp += grad_o[offset] * input[offset];
-    }
-  }
-  temp[tid] = tmp;
-  __syncthreads();
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      temp[tid] += temp[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    grad_w[blockIdx.x] += temp[0];
-  }
-}
-
-void hl_param_relu_backward_w(real* grad_w,
-                              real* grad_o,
-                              real* input,
-                              int width,
-                              int height,
-                              int partial_sum) {
-  CHECK_NOTNULL(grad_w);
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(input);
-  const int blockSize = 1024;
-  int grid_num = width / partial_sum;
-  dim3 threads(blockSize, 1);
-  dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_w, grad_o, input, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_w failed");
-}
-
-__global__ void KeParamReluBackwardDiff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    diff[index] += grad_o[index] * (input[index] > 0 ? 1 : w[tx / partial_sum]);
-  }
-}
-
-void hl_param_relu_backward_diff(real* grad_o,
-                                 real* data,
-                                 real* w,
-                                 real* diff,
-                                 int width,
-                                 int height,
-                                 int partial_sum) {
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(data);
-  CHECK_NOTNULL(w);
-  CHECK_NOTNULL(diff);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_o, data, w, diff, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_diff failed");
-}
-
-__global__ void KeMatrixAddSharedBias(
-    real* A, real* B, const int channel, const int M, const int N, real scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int dim = N / channel;
-  if (index < M * N) {
-    int i = index % N;
-    i = i / dim;
-    A[index] += scale * B[i];
-  }
-}
-
-void hl_matrix_add_shared_bias(real* A_d,
-                               real* B_d,
-                               const int channel,
-                               const int dimM,
-                               const int dimN,
-                               real scale) {
-  const int blocks = 512;
-  const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      A_d, B_d, channel, dimM, dimN, scale);
-  CHECK_SYNC("hl_matrix_add_shared_bias failed");
-}
-
-template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real* B,
-                                          real* A,
-                                          const int channel,
-                                          const int M,
-                                          const int N,
-                                          const int dim,
-                                          const int limit,
-                                          real scale) {
-  if (dim < limit) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < channel) {
-      real sum = 0.0;
-      for (int i = 0; i < M; ++i) {
-        for (int j = 0; j < dim; ++j) {
-          sum += A[i * N + index * dim + j];
-        }
-      }
-      B[index] += scale * sum;
-    }
-  } else {
-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
-    __shared__ real smem[blockSize];
-    real sum = 0.0;
-    for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
-      int n = j * blockSize + tid;
-      int m = n / dim;
-      int w = n % dim;
-      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
-      __syncthreads();
-      simpleReduce(smem, tid, blockSize);
-      sum += smem[0];
-    }
-    if (tid == 0) {
-      B[bid] += scale * sum;
-    }
-  }
-}
-
-void hl_matrix_collect_shared_bias(real* B_d,
-                                   real* A_d,
-                                   const int channel,
-                                   const int dimM,
-                                   const int dimN,
-                                   real scale) {
-  const int dim = dimN / channel;
-  const int blocks = 256;
-  const int limit = 64;
-  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
-
-  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
-  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
-}
-
-__global__ void keMatrixRotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < dimM * dimN) {
-    int i = idx / dimN;
-    int j = idx % dimN;
-    if (clockWise) {
-      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
-    } else {
-      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
-    }
-  }
-}
-
-void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  CHECK_NOTNULL(mat);
-  CHECK_NOTNULL(matRot);
-  const int threads = 512;
-  const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
-      mat, matRot, dimM, dimN, clockWise);
-  CHECK_SYNC("hl_matrix_rotate failed");
-}
-
-__global__ void keMatrixVol2Col(int num_kernels,
-                                const real* dataSrc,
-                                real* dataDst,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    int w_out = index % width_col;
-    int h_out = (index / width_col) % height_col;
-    int d_out = (index / width_col / height_col) % depth_col;
-    int channel_in = index / width_col / height_col / depth_col;
-    int channel_out = channel_in * filterD * filterH * filterW;
-    int w_in = w_out * strideW - paddingW;
-    int h_in = h_out * strideH - paddingH;
-    int d_in = d_out * strideD - paddingD;
-
-    dataDst +=
-        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
-        w_out;
-    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
-    for (int k = 0; k < filterD; ++k) {
-      for (int i = 0; i < filterH; ++i) {
-        for (int j = 0; j < filterW; ++j) {
-          int d = d_in + k;
-          int h = h_in + i;
-          int w = w_in + j;
-          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
-                      w < width)
-                         ? dataSrc[(k * height + i) * width + j]
-                         : 0;
-          dataDst += depth_col * height_col * width_col;
-        }
-      }
-    }
-  }
-}
-
-void hl_matrix_vol2Col(const real* dataSrc,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real* dataDst) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth_col * height_col * width_col;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataSrc,
-                                                          dataDst,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col);
-  CHECK_SYNC("hl_matrix_vol2Col failed");
-}
-
-__global__ void keMatrixCol2Vol(int num_kernels,
-                                real* dataDst,
-                                const real* dataSrc,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col,
-                                real alpha,
-                                real beta) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    real srcVal = 0;
-    real dstVal = dataDst[index];
-    int w = index % width + paddingW;
-    int h = (index / width) % height + paddingH;
-    int d = (index / width / height) % depth + paddingD;
-    int c = index / width / height / depth;
-    // compute the start and end of the output
-    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
-    int w_col_end = min(w / strideW + 1, width_col);
-    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
-    int h_col_end = min(h / strideH + 1, height_col);
-    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
-    int d_col_end = min(d / strideD + 1, depth_col);
-
-    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
-                  h * filterW + w) *
-                 depth_col * height_col * width_col;
-
-    int coeff_d_col =
-        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
-    int coeff_h_col =
-        (1 - strideH * filterW * depth_col * height_col) * width_col;
-    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
-
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
-                            w_col * coeff_w_col];
-        }
-      }
-    }
-    dataDst[index] = alpha * srcVal + beta * dstVal;
-  }
-}
-
-void hl_matrix_col2Vol(real* dataDst,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       const real* dataSrc,
-                       real alpha,
-                       real beta) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth * height * width;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataDst,
-                                                          dataSrc,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col,
-                                                          alpha,
-                                                          beta);
-
-  CHECK_SYNC("hl_matrix_col2Vol failed");
-}
-
-__global__ void keVectorCast2Int(int* out, real* vec, int size) {
-  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
-    out[i] = int(vec[i]);
-  }
-}
-
-void hl_vector_cast2int(int* out, real* vec, int size) {
-  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
-  CHECK_SYNC("hl_vector_cast2int failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sequence.cu b/paddle/legacy/cuda/src/hl_cuda_sequence.cu
deleted file mode 100644
index 1d772b5ce..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sequence.cu
+++ /dev/null
@@ -1,408 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-__global__ void KeMaxSequenceForward(real* input,
-                                     const int* sequence,
-                                     real* output,
-                                     int* index,
-                                     int numSequences,
-                                     int dim) {
-  int dimIdx = threadIdx.x;
-  int sequenceId = blockIdx.x;
-  if (sequenceId >= numSequences) return;
-  int start = sequence[sequenceId];
-  int end = sequence[sequenceId + 1];
-
-  for (int i = dimIdx; i < dim; i += blockDim.x) {
-    real tmp = -HL_FLOAT_MAX;
-    int tmpId = -1;
-    for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId * dim + i]) {
-        tmp = input[insId * dim + i];
-        tmpId = insId;
-      }
-    }
-    output[sequenceId * dim + i] = tmp;
-    index[sequenceId * dim + i] = tmpId;
-  }
-}
-
-void hl_max_sequence_forward(real* input,
-                             const int* sequence,
-                             real* output,
-                             int* index,
-                             int numSequences,
-                             int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(index);
-
-  dim3 threads(256, 1);
-  dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, sequence, output, index, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_forward failed");
-}
-
-__global__ void KeMaxSequenceBackward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int colIdx = idx % dim;
-  if (idx < numSequences * dim) {
-    int insId = index[idx];
-    inputGrad[insId * dim + colIdx] += outputGrad[idx];
-  }
-}
-
-void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(index);
-  CHECK_NOTNULL(inputGrad);
-
-  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
-  dim3 threads(128, 1);
-  dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      outputGrad, index, inputGrad, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_backward failed");
-}
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                real* table,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int sampleId = blockIdx.x + idy * gridDimX;
-
-  while (sampleId < numSamples) {
-    int tableId = ids[sampleId];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* outputData = output + sampleId * dim;
-      real* tableData = table + tableId * dim;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow == 0) {
-          outputData[i] += tableData[i];
-        } else {
-          paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
-        }
-      }
-    }
-    sampleId += blockDimY * gridDimX;
-  }
-}
-
-template <int blockDimX,
-          int blockDimY,
-          int gridDimX,
-          bool seq2batch,
-          bool isAdd>
-__global__ void KeSequence2Batch(real* batch,
-                                 real* sequence,
-                                 const int* batchIndex,
-                                 int seqWidth,
-                                 int batchCount) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int id = blockIdx.x + idy * gridDimX;
-  while (id < batchCount) {
-    int seqId = batchIndex[id];
-    real* batchData = batch + id * seqWidth;
-    real* seqData = sequence + seqId * seqWidth;
-    for (int i = idx; i < seqWidth; i += blockDimX) {
-      if (seq2batch) {
-        if (isAdd) {
-          batchData[i] += seqData[i];
-        } else {
-          batchData[i] = seqData[i];
-        }
-      } else {
-        if (isAdd) {
-          seqData[i] += batchData[i];
-        } else {
-          seqData[i] = batchData[i];
-        }
-      }
-    }
-    id += blockDimY * gridDimX;
-  }
-}
-
-void hl_sequence2batch_copy(real* batch,
-                            real* sequence,
-                            const int* batchIndex,
-                            int seqWidth,
-                            int batchCount,
-                            bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_copy failed");
-}
-
-void hl_sequence2batch_add(real* batch,
-                           real* sequence,
-                           int* batchIndex,
-                           int seqWidth,
-                           int batchCount,
-                           bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_add failed");
-}
-
-template <bool normByTimes, bool seq2batch>
-__global__ void KeSequence2BatchPadding(real* batch,
-                                        real* sequence,
-                                        const int* sequenceStartPositions,
-                                        const size_t sequenceWidth,
-                                        const size_t maxSequenceLength,
-                                        const size_t numSequences) {
-  int batchIdx = blockIdx.y;
-  int sequenceStart = sequenceStartPositions[batchIdx];
-  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
-
-  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
-  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
-  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
-
-  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
-
-  if (sequenceIdx < sequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
-      }
-    } else {
-      /* batch -> sequence */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
-      }
-    }
-  } else if (sequenceIdx < maxSequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = 0;
-      }
-    }
-  }
-}
-
-void hl_sequence2batch_copy_padding(real* batch,
-                                    real* sequence,
-                                    const int* sequenceStartPositions,
-                                    const size_t sequenceWidth,
-                                    const size_t maxSequenceLength,
-                                    const size_t numSequences,
-                                    bool normByTimes,
-                                    bool seq2batch) {
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(sequenceStartPositions);
-
-  if (!normByTimes && numSequences == 1) {
-    size_t elementCount = maxSequenceLength * sequenceWidth;
-    if (seq2batch) {
-      /* sequence -> batch */
-      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
-    } else {
-      /* batch -> sequence */
-      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
-    }
-    return;
-  }
-
-  const int CUDA_BLOCK_SIZE = 512;
-
-  /* At least use 32 threads to copy sequenceWidth elements,
-     and at least 8 elements for each thread. */
-  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
-  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
-
-  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
-  dim3 threads(blockDimX, blockDimY);
-
-  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
-  int gridDimY = numSequences;
-  dim3 grid(gridDimX, gridDimY);
-
-  if (seq2batch) {
-    /* sequence -> batch */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  } else {
-    /* batch -> sequence */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  }
-
-  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
-}
-
-__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
-
-__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
-
-__global__ void KeSequenceAvgForward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real sum = 0.0;
-    for (int i = start; i < end; i++) {
-      sum += src[i * width + col];
-    }
-    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
-                                       : sum * my_rsqrt((real)seqLength));
-    dst[gid] += sum;
-  }
-}
-
-void hl_sequence_avg_forward(real* dst,
-                             real* src,
-                             const int* starts,
-                             int height,
-                             int width,
-                             const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_forward!";
-
-  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_forward failed");
-}
-
-__global__ void KeSequenceAvgBackward(real* dst,
-                                      real* src,
-                                      const int* starts,
-                                      int height,
-                                      int width,
-                                      const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real grad = src[gid];
-    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
-                                         : grad * my_rsqrt((real)seqLength));
-    for (int i = start; i < end; i++) {
-      dst[i * width + col] += grad;
-    }
-  }
-}
-
-void hl_sequence_avg_backward(real* dst,
-                              real* src,
-                              const int* starts,
-                              int height,
-                              int width,
-                              const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_backward!";
-
-  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_backward failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cu b/paddle/legacy/cuda/src/hl_cuda_sparse.cu
deleted file mode 100644
index 8065a6f9f..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sparse.cu
+++ /dev/null
@@ -1,1262 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda.h"
-#include "hl_cuda_sparse.cuh"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sparse.h"
-#include "hl_sparse.ph"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-
-void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
-        A_d2->csr_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csr2dense failed");
-}
-
-void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
-        A_d2->csc_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csc2dense failed");
-}
-
-void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                             hl_matrix_format_t format,
-                             hl_matrix_value_t value_type,
-                             int dimM,
-                             int dimN,
-                             int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-  CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-  /* avoid malloc 0 bytes */
-  int nnz_s = (nnz == 0 ? 1 : nnz);
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->sparsity = -1.0;
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    }
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->sparsity = -1.0f;
-
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    }
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (A_d->matrix == NULL) {
-    free(A_d);
-    return;
-  }
-
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix;
-    if (csr->csr_val != NULL) {
-      hl_free_mem_device(csr->csr_val);
-      csr->csr_val = NULL;
-    }
-
-    if (csr->csr_row != NULL) {
-      hl_free_mem_device(csr->csr_row);
-      csr->csr_row = NULL;
-    }
-
-    if (csr->csr_col != NULL) {
-      hl_free_mem_device(csr->csr_col);
-      csr->csr_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  } else if (A_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix;
-    if (csc->csc_val != NULL) {
-      hl_free_mem_device(csc->csc_val);
-      csc->csc_val = NULL;
-    }
-
-    if (csc->csc_row != NULL) {
-      hl_free_mem_device(csc->csc_row);
-      csc->csc_row = NULL;
-    }
-
-    if (csc->csc_col != NULL) {
-      hl_free_mem_device(csc->csc_col);
-      csc->csc_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  }
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                void *dest_d,
-                                size_t size,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->csr_row = (int *)dest_d;
-      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
-    } else {
-      csr->csr_val = (real *)dest_d;
-      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
-      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimM + 1) * sizeof(int));
-    }
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->csc_col = (int *)dest_d;
-      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
-    } else {
-      csc->csc_val = (real *)dest_d;
-      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
-      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimN + 1) * sizeof(int));
-    }
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                real *value_d,
-                                int *rows_d,
-                                int *cols_d,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->csr_row = rows_d;
-    csr->csr_col = cols_d;
-    csr->csr_val = value_d;
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->csc_row = rows_d;
-    csc->csc_col = cols_d;
-    csc->csc_val = value_d;
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  free(A_d);
-}
-
-void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                          real *csr_val,
-                          int *csr_row,
-                          int *csr_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format!";
-  CHECK_NOTNULL(csr_matrix->matrix);
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csr->nnz_s;
-
-  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
-      << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size "
-      << csr->row_s;
-
-  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csr_matrix->type == HL_NO_VALUE) {
-    if (csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  } else if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-    } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  }
-
-  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
-                  ((float)csr_matrix->cols);
-}
-
-void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                          real *csc_val,
-                          int *csc_row,
-                          int *csc_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csc->nnz_s;
-
-  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
-      << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size "
-      << csc->col_s;
-
-  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csc_matrix->type == HL_NO_VALUE) {
-    if (csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  } else if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-    } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  }
-
-  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
-                  ((float)csc_matrix->cols);
-}
-
-void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                             hl_sparse_matrix_s src,
-                             hl_stream_t stream) {
-  CHECK(dst && src && dst->matrix && src->matrix)
-      << "parameter dst or src is null pointer!";
-  CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
-  CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
-      << "src sparse matrix is no value, dst sparse matrix has value!";
-
-  if (dst->format == HL_SPARSE_CSR) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-    hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
-  } else if (dst->format == HL_SPARSE_CSC) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
-    hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
-  } else {
-    LOG(FATAL) << "sparse matrix format error!";
-  }
-}
-
-/**
- * Calculate beta * C, if beta is zero, C does not have to be a valid input.
- */
-static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
-  if (beta == 0.0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
-  } else {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
-    }
-  }
-
-  return;
-}
-
-void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0);
-  CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
-}
-
-void hl_matrix_dense_mul_csc(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
-  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
-}
-
-void hl_matrix_dense_mul_csr(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
-}
-
-void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
-}
-
-void hl_sparse_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          hl_sparse_matrix_s C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!";
-
-  if (C_d->nnz == 0) return;
-
-  if (C_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
-    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
-        C_d2->csc_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    int blocksX = dimN;
-    int blocksY = 1;
-    dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1);
-    dim3 grid(blocksX, blocksY);
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
-        C_d2->csc_val,
-        C_d2->csc_row,
-        C_d2->csc_col,
-        A_d,
-        B_d,
-        transA,
-        transB,
-        dimM,
-        dimN,
-        dimK,
-        alpha,
-        beta);
-    CHECK_SYNC("hl_sparse_matrix_mul failed");
-  } else {
-    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
-    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
-        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    if (!transB) {
-      int blocksX = dimM;
-      int blocksY = 1;
-      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
-      dim3 grid(blocksX, blocksY);
-
-      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    } else {
-      CHECK(!transA) << "Not supported A is trans and B is not trans!";
-
-      dim3 block(CU_BLOCK_SIZE, 1);
-      int avgNnzPerRow = C_d->nnz / dimM;
-      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
-      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
-      dim3 grid(gridx, dimM);
-      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    }
-  }
-}
-
-void hl_memcpy_from_csc_matrix(real *csc_val,
-                               size_t val_size,
-                               int *csc_row,
-                               size_t row_size,
-                               int *csc_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csc_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_NOTNULL(csc_row);
-  CHECK_NOTNULL(csc_col);
-
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  if (csc_matrix->nnz > row_size ||
-      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  hl_memcpy_async((void *)csc_row,
-                  (void *)csc->csc_row,
-                  (csc_matrix->nnz) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csc_col,
-                  (void *)csc->csc_col,
-                  (csc_matrix->cols + 1) * sizeof(int),
-                  stream);
-  if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val != NULL) {
-      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csc_val,
-                      (void *)csc->csc_val,
-                      (csc_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_memcpy_from_csr_matrix(real *csr_val,
-                               size_t val_size,
-                               int *csr_row,
-                               size_t row_size,
-                               int *csr_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csr_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_NOTNULL(csr_row);
-  CHECK_NOTNULL(csr_col);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format error!";
-
-  if (csr_matrix->nnz > col_size ||
-      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  hl_memcpy_async((void *)csr_row,
-                  (void *)csr->csr_row,
-                  (csr_matrix->rows + 1) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csr_col,
-                  (void *)csr->csr_col,
-                  (csr_matrix->nnz) * sizeof(int),
-                  stream);
-  if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val != NULL) {
-      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csr_val,
-                      (void *)csr->csr_val,
-                      (csr_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  if (B_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (B_d->nnz == 0) return;
-
-  int nnz = B_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d, B_d2->csr_val, B_d2->csr_col, nnz);
-
-  CHECK_SYNC("hl_matrix_csr_column_sum failed");
-}
-
-void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_bias(A_d, B_d, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int nnz = A_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz);
-
-  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
-}
-
-void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                real *B_d,
-                                int dimM,
-                                int dimN,
-                                real alpha,
-                                real beta) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                             real *B_d,
-                             int dimM,
-                             int dimN,
-                             real alpha,
-                             real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) {
-    LOG(FATAL) << "parameter dim error!";
-  }
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int gridX = DIVUP((A_d->nnz / dimM), 512);
-  gridX = gridX > 0 ? gridX : 1;
-  dim3 block(512, 1);
-  dim3 grid(gridX, dimM);
-  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
-                                                           A_d2->csr_row,
-                                                           A_d2->csr_col,
-                                                           B_d,
-                                                           alpha,
-                                                           beta,
-                                                           dimM,
-                                                           dimN);
-
-  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
-}
-
-int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, row);
-}
-
-int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, col);
-}
-
-real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, val);
-}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cuh b/paddle/legacy/cuda/src/hl_cuda_sparse.cuh
deleted file mode 100644
index adb898c9a..000000000
--- a/paddle/legacy/cuda/src/hl_cuda_sparse.cuh
+++ /dev/null
@@ -1,1015 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#include "hl_device_functions.cuh"
-
-template <int VALUE_TYPE>
-__device__ real findvalue(real* csr_val,
-                          int* csr_col,
-                          int col_start,
-                          int col_end,
-                          int index) {
-  int start = col_start;
-  int end = col_end-1;
-  int mid = -1;
-
-  while (start < end) {
-    mid = start + ((end - start) / 2);
-    if (csr_col[mid] < index)
-      start = mid + 1;
-    else
-      end = mid;
-  }
-
-  if ((start < col_end) && (csr_col[start] == index)) {
-    real ret = VALUE_TYPE == 0 ? 1.0 : csr_val[start];
-    return ret;
-  } else {
-    return 0.0;
-  }
-}
-
-#define     CU_CSR2DENSE_THREAD_X   16
-#define     CU_CSR2DENSE_THREAD_Y   16
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsr2Dense(real * csr_val,
-                                   int * csr_row,
-                                   int * csr_col,
-                                   real * C_d,
-                                   const int dimM,
-                                   const int dimN) {
-  const int row = blockIdx.y*blockDim.y+threadIdx.y;
-  const int col = blockIdx.x*blockDim.x+threadIdx.x;
-
-  if (row >= dimM || col >= dimN) {
-    return;
-  }
-
-  int start = csr_row[row];
-  int end = csr_row[row+1];
-
-  real sum = findvalue<VALUE_TYPE>(csr_val, csr_col, start, end, col);
-  C_d[row*dimN + col] = sum;
-}
-
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsc2Dense(real * csc_val,
-                                   int * csc_row,
-                                   int * csc_col,
-                                   real * C_d,
-                                   const int dimM,
-                                   const int dimN) {
-  const int row = blockIdx.y*blockDim.y+threadIdx.y;
-  const int col = blockIdx.x*blockDim.x+threadIdx.x;
-
-  if (row >= dimM || col >= dimN) {
-    return;
-  }
-
-  int start = csc_col[col];
-  int end = csc_col[col+1];
-
-  real sum = findvalue<VALUE_TYPE>(csc_val, csc_row, start, end, row);
-  C_d[row*dimN + col] = sum;
-}
-
-__device__ __forceinline__
-void _calculate_c(real &c, real sum) {
-  c = sum;
-}
-__device__ __forceinline__
-void _calculate_c(real &c, real sum, real beta) {
-  c = sum + beta * c;
-}
-
-#define     CU_CSRMM_N                  4
-#define     CU_CSRMM_THREAD_X           32
-#define     CU_CSRMM_THREAD_Y           32
-#define     CU_CSRMM_BLOCK_N            (32*CU_CSRMM_N)
-#define     CU_CSRMM_SHARED_ELEMENT     (2*CU_CSRMM_THREAD_X)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCsrMulDense(real *C_d,
-                                     real * csr_val,
-                                     int * csr_col,
-                                     int * csr_row,
-                                     real *B_d,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_m = blockIdx.y*CU_CSRMM_THREAD_Y+threadIdx.y;
-  int index_n = blockIdx.x*CU_CSRMM_BLOCK_N+threadIdx.x;
-
-  __shared__ real csr_val_sh[CU_CSRMM_THREAD_Y][CU_CSRMM_SHARED_ELEMENT];
-  __shared__ int csr_col_sh[CU_CSRMM_THREAD_Y][CU_CSRMM_SHARED_ELEMENT];
-
-  if (index_m >= dimM) {
-    return;
-  }
-
-  // possible optimization, cache this in shared memory
-  int csr_start = csr_row[index_m];
-  int csr_end = csr_row[index_m+1];
-  int csr_index =  csr_start + idx;
-
-  int csr_iter = (csr_end-csr_start)/CU_CSRMM_SHARED_ELEMENT;
-  int csr_rem = (csr_end-csr_start)%CU_CSRMM_SHARED_ELEMENT;
-
-  int index_k = -1;
-  real sum[CU_CSRMM_N] = {0};
-  real b_r[CU_CSRMM_N] = {0};
-
-  for (int csr_i = 0; csr_i < csr_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0; i < (CU_CSRMM_SHARED_ELEMENT/CU_CSRMM_THREAD_X); i++) {
-      if (VALUE_TYPE != 0) {
-        csr_val_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_val[csr_index];
-      }
-      csr_col_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_col[csr_index];
-      csr_index += CU_CSRMM_THREAD_X;
-    }
-
-    for (int index = 0; index < CU_CSRMM_SHARED_ELEMENT; index++) {
-      index_k = csr_col_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idy][index];
-      int tmp_index = index_n;
-      real *B_d_r = B_d + tmp_index;
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        if (tmp_index >= dimN) break;
-        b_r[n] = B_d_r[index_k*dimN];
-        B_d_r += CU_CSRMM_THREAD_X;
-        tmp_index += CU_CSRMM_THREAD_X;
-      }
-
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        sum[n] = VALUE_TYPE == 0 ? sum[n] + b_r[n] : sum[n] + a_r*b_r[n];
-      }
-    }
-    // __syncthreads();
-  }
-
-  if (csr_rem != 0) {
-    #pragma unroll
-    for (int i = 0; i < (CU_CSRMM_SHARED_ELEMENT/CU_CSRMM_THREAD_X); i++) {
-      if (csr_index < csr_end) {
-        if (VALUE_TYPE != 0) {
-            csr_val_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_val[csr_index];
-        }
-        csr_col_sh[idy][idx + i*CU_CSRMM_THREAD_X] = csr_col[csr_index];
-      }
-      csr_index += CU_CSRMM_THREAD_X;
-    }
-    // __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csr_rem; index++) {
-      index_k = csr_col_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idy][index];
-      int tmp_index = index_n;
-      real *B_d_r = B_d + tmp_index;
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        if (tmp_index >= dimN) break;
-        b_r[n] = B_d_r[index_k*dimN];
-        B_d_r += CU_CSRMM_THREAD_X;
-        tmp_index += CU_CSRMM_THREAD_X;
-      }
-
-      #pragma unroll
-      for (int n = 0; n < CU_CSRMM_N; n++) {
-        sum[n] = VALUE_TYPE == 0 ? sum[n] + b_r[n] : sum[n] + a_r*b_r[n];
-      }
-    }
-  }
-
-  C_d += __mul24(index_m, dimN);
-  if (beta == 0.0) {
-    for (int n = 0; n < CU_CSRMM_N; n++) {
-      if (index_n < dimN) {
-        _calculate_c(C_d[index_n], alpha * sum[n]);
-        index_n += CU_CSRMM_THREAD_X;
-      }
-    }
-  } else {
-    for (int n = 0; n < CU_CSRMM_N; n++) {
-      if (index_n < dimN) {
-        _calculate_c(C_d[index_n], alpha * sum[n], beta);
-        index_n += CU_CSRMM_THREAD_X;
-      }
-    }
-  }
-}
-
-#define CU_CSC_MUL_DENSE_THREAD_N           1
-#define CU_CSC_MUL_DENSE_THREAD_X           32
-#define CU_CSC_MUL_DENSE_THREAD_Y           4
-#define CU_CSC_MUL_DENSE_BLOCK_K            (CU_CSC_MUL_DENSE_THREAD_Y)
-#define CU_CSC_MUL_DENSE_BLOCK_N            \
-        (CU_CSC_MUL_DENSE_THREAD_N * CU_CSC_MUL_DENSE_THREAD_X)
-#define CU_CSC_MUL_DENSE_SHARED_ELEMENT     (CU_CSC_MUL_DENSE_THREAD_X)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixCscMulDense(real *C_d,
-                                     real * csc_val,
-                                     int * csc_row,
-                                     int * csc_col,
-                                     real *B_d,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_k = blockIdx.y*CU_CSC_MUL_DENSE_BLOCK_K+threadIdx.y;
-  const int index_n = blockIdx.x*CU_CSC_MUL_DENSE_BLOCK_N+threadIdx.x;
-
-  if (index_k >= dimK) {
-    return;
-  }
-
-  __shared__
-  real csc_val_sh[CU_CSC_MUL_DENSE_THREAD_Y][CU_CSC_MUL_DENSE_SHARED_ELEMENT];
-  __shared__
-  int csc_row_sh[CU_CSC_MUL_DENSE_THREAD_Y][CU_CSC_MUL_DENSE_SHARED_ELEMENT];
-
-  // possible optimization, cache this in shared memory
-  int csc_start = csc_col[index_k];
-  int csc_end = csc_col[index_k+1];
-  int csc_index = csc_start + idx;
-  int csc_iter = (csc_end-csc_start)/CU_CSC_MUL_DENSE_SHARED_ELEMENT;
-  int csc_rem = (csc_end-csc_start)%CU_CSC_MUL_DENSE_SHARED_ELEMENT;
-  int index_m = -1;
-
-  real b_r[CU_CSC_MUL_DENSE_THREAD_N] = {0};
-  real *B_d_r;
-  real *C_d_r;
-  int index_n_t;
-  B_d += index_n + __mul24(index_k, dimN);
-  C_d += index_n;
-  for (int csr_i = 0; csr_i < csc_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0;
-         i < (CU_CSC_MUL_DENSE_SHARED_ELEMENT/CU_CSC_MUL_DENSE_THREAD_X); i++) {
-      if (VALUE_TYPE != 0) {
-        csc_val_sh[idy][idx + i*CU_CSC_MUL_DENSE_THREAD_X] = csc_val[csc_index];
-      }
-      csc_row_sh[idy][idx + i*CU_CSC_MUL_DENSE_THREAD_X] = csc_row[csc_index];
-      csc_index += CU_CSC_MUL_DENSE_THREAD_X;
-    }
-
-    #pragma unroll
-    for (int index = 0; index < CU_CSC_MUL_DENSE_SHARED_ELEMENT; index++) {
-      index_m = csc_row_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      B_d_r = B_d;
-      C_d_r = C_d + __mul24(index_m, dimN);
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          b_r[n] = B_d_r[0];
-          B_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          real tmp;
-          tmp = alpha*a_r*b_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-    }
-    // __syncthreads();
-  }
-
-  if (csc_rem != 0) {
-    #pragma unroll
-    for (int i = 0;
-         i < (CU_CSC_MUL_DENSE_SHARED_ELEMENT/CU_CSC_MUL_DENSE_THREAD_X); i++) {
-      if (csc_index < csc_end) {
-        if (VALUE_TYPE != 0) {
-          csc_val_sh[idy][idx + i * CU_CSC_MUL_DENSE_THREAD_X] =
-            csc_val[csc_index];
-        }
-        csc_row_sh[idy][idx + i * CU_CSC_MUL_DENSE_THREAD_X] =
-          csc_row[csc_index];
-      }
-      csc_index += CU_CSC_MUL_DENSE_THREAD_X;
-    }
-    // __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csc_rem; index++) {
-      index_m = csc_row_sh[idy][index];
-      real a_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      B_d_r = B_d;
-      C_d_r = C_d + __mul24(index_m, dimN);
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          b_r[n] = B_d_r[0];
-          B_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-
-      index_n_t = index_n;
-      #pragma unroll
-      for (int n = 0; n < CU_CSC_MUL_DENSE_THREAD_N; n++) {
-        if (index_n_t < dimN) {
-          real tmp;
-          tmp = alpha*a_r*b_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
-          index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
-        }
-      }
-    }
-  }
-}
-
-/* best perf */
-#ifndef PADDLE_TYPE_DOUBLE
-#define CU_CSCMM_THREAD_M_BEST          9
-#else
-#define CU_CSCMM_THREAD_M_BEST          4
-#endif
-#define CU_CSCMM_THREAD_X_BEST          32
-#define CU_CSCMM_THREAD_Y_BEST          32
-#define CU_CSCMM_BLOCK_M_BEST  (CU_CSCMM_THREAD_M_BEST * CU_CSCMM_THREAD_X_BEST)
-#define CU_CSCMM_BLOCK_N_BEST  (CU_CSCMM_THREAD_Y_BEST)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixDenseMulCsc(real *C_d,
-                                     const real *A_d,
-                                     const real *csc_val,
-                                     const int *csc_row,
-                                     const int *csc_col,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  __shared__ real csc_val_sh[CU_CSCMM_BLOCK_N_BEST][CU_CSCMM_THREAD_X_BEST];
-  __shared__ int csc_row_sh[CU_CSCMM_BLOCK_N_BEST][CU_CSCMM_THREAD_X_BEST];
-  __shared__ real A_s[CU_CSCMM_BLOCK_M_BEST][CU_CSCMM_THREAD_Y_BEST+1];
-
-  int iter_k = dimK/CU_CSCMM_THREAD_Y_BEST;
-  int rem_k = dimK%CU_CSCMM_THREAD_Y_BEST;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  const int index_n = blockIdx.y*CU_CSCMM_BLOCK_N_BEST+threadIdx.y;
-
-  int csc_start;
-  int csc_end;
-  if (index_n < dimN) {
-    csc_start = csc_col[index_n];
-    csc_end = csc_col[index_n+1];
-  } else {
-    csc_start = 0;
-    csc_end = 0;
-  }
-  int csc_index =  csc_start + idx;
-  int csc_iter = (csc_end-csc_start)/CU_CSCMM_THREAD_X_BEST;
-  int csc_rem = (csc_end-csc_start)%CU_CSCMM_THREAD_X_BEST;
-  int index_k = -1;
-
-  if (csc_index < csc_end) {
-    if (VALUE_TYPE != 0) {
-      csc_val_sh[idy][idx] = csc_val[csc_index];
-    }
-    csc_row_sh[idy][idx] = csc_row[csc_index];
-    csc_index += CU_CSCMM_THREAD_X_BEST;
-  }
-
-  const int ibx = blockIdx.x * CU_CSCMM_BLOCK_M_BEST;
-  int dim = ibx+idy;
-  A_d += idx + __mul24(dim, dimK);
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    A_s[idy + m * 32][idx] = 0.0f;
-    if (dim + m * 32 < dimM && idx < dimK) {
-      A_s[idy + m * 32][idx] = A_d[m * 32 * dimK];
-    }
-  }
-  __syncthreads();
-
-  real b_r;
-  real a_r[CU_CSCMM_THREAD_M_BEST] = {0};
-  real sum[CU_CSCMM_THREAD_M_BEST] = {0};
-  real A_r_s[CU_CSCMM_THREAD_M_BEST] = {0};
-  int index = 0;
-  int block_end_k = 0;;
-  int index_iter_csc = csc_iter;
-
-  for (int i_k = 0; i_k < iter_k; i_k++) {
-    A_d += CU_CSCMM_THREAD_Y_BEST;
-    block_end_k += CU_CSCMM_THREAD_Y_BEST;
-    #pragma unroll
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (dim + m*32 < dimM && (idx + (i_k+1)*CU_CSCMM_THREAD_Y_BEST < dimK)) {
-        A_r_s[m] = A_d[m*32*dimK];
-      } else {
-        A_r_s[m] = 0.0f;
-      }
-    }
-
-    if (index_iter_csc > 0) {
-      goto WARP_SYNC;
-    } else {
-      goto WARP_SYNC_2;
-    }
-
-    while (index_iter_csc) {
-      if (VALUE_TYPE != 0) {
-        csc_val_sh[idy][idx] = csc_val[csc_index];
-      }
-      csc_row_sh[idy][idx] = csc_row[csc_index];
-      csc_index += CU_CSCMM_THREAD_X_BEST;
-      index = 0;
-
-WARP_SYNC:
-      for (; index < CU_CSCMM_THREAD_X_BEST; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= block_end_k) {
-          goto BLOCK_SYNC;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-i_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-      index_iter_csc--;
-    }
-
-    if (csc_rem != 0) {
-      if (csc_iter != 0) {
-        if (csc_index < csc_end) {
-          if (VALUE_TYPE != 0) {
-            csc_val_sh[idy][idx] = csc_val[csc_index];
-          }
-          csc_row_sh[idy][idx] = csc_row[csc_index];
-          csc_index += CU_CSCMM_THREAD_X_BEST;
-        }
-        index = 0;
-      }
-      __threadfence_block();
-
-WARP_SYNC_2:
-      for (; index < csc_rem; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= block_end_k) {
-          goto BLOCK_SYNC;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-i_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-    }
-
-BLOCK_SYNC:
-    __syncthreads();
-    #pragma unroll
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      A_s[idy+m*32][idx] = A_r_s[m];
-    }
-    __syncthreads();
-  }
-
-  if (rem_k != 0) {
-    if (index_iter_csc == 0) {
-      goto TEMP_TEST;
-    }
-
-    for (; index < CU_CSCMM_THREAD_X_BEST; index++) {
-      index_k = csc_row_sh[idy][index];
-      if (index_k >= dimK) {
-        break;
-      }
-
-      b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-      #pragma unroll
-      for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-        a_r[m] = A_s[idx+m*32][index_k-iter_k*CU_CSCMM_THREAD_Y_BEST];
-        sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-      }
-    }
-
-    if (csc_rem != 0) {
-      if (csc_index < csc_end) {
-        if (VALUE_TYPE != 0) {
-          csc_val_sh[idy][idx] = csc_val[csc_index];
-        }
-        csc_row_sh[idy][idx] = csc_row[csc_index];
-        csc_index += CU_CSCMM_THREAD_X_BEST;
-      }
-      index = 0;
-
-TEMP_TEST:
-      for (; index < csc_rem; index++) {
-        index_k = csc_row_sh[idy][index];
-        if (index_k >= dimK) {
-            break;
-        }
-        b_r = VALUE_TYPE == 0 ? 1.0 : csc_val_sh[idy][index];
-        #pragma unroll
-        for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-          a_r[m] = A_s[idx+m*32][index_k-iter_k*CU_CSCMM_THREAD_Y_BEST];
-          sum[m] = VALUE_TYPE == 0 ? sum[m] + a_r[m] : sum[m] + a_r[m]*b_r;
-        }
-      }
-    }
-  }
-
-  __syncthreads();
-  #pragma unroll
-  for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-    A_s[idx+m*32][idy] = alpha*sum[m];
-  }
-  __syncthreads();
-
-  int index_m_c = ibx + idy;
-  int index_n_c = blockIdx.y*CU_CSCMM_BLOCK_N_BEST + idx;
-  C_d += index_n_c + __mul24(index_m_c, dimN);
-  if (beta == 0.0) {
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (index_m_c < dimM && index_n_c < dimN) {
-        _calculate_c(C_d[0], A_s[idy + m * 32][idx]);
-      }
-      index_m_c += 32;
-      C_d += dimN*32;
-    }
-  } else {
-    for (int m = 0; m < CU_CSCMM_THREAD_M_BEST; m++) {
-      if (index_m_c < dimM && index_n_c < dimN) {
-        _calculate_c(C_d[0], A_s[idy + m * 32][idx], beta);
-      }
-      index_m_c += 32;
-      C_d += dimN*32;
-    }
-  }
-}
-
-#define     CU_DM_CSR_THREAD_X           32
-#define     CU_DM_CSR_THREAD_Y           4
-#define     CU_DM_CSR_N                  4
-#define     CU_DM_CSR_BLOCK_M            (CU_DM_CSR_N*CU_DM_CSR_THREAD_Y)
-#define     CU_DM_CSR_BLOCK_K            (CU_DM_CSR_THREAD_X)
-#define     CU_DM_CSR_SHARED_ELEMENT     (1*CU_DM_CSR_THREAD_Y)
-template <int VALUE_TYPE>
-__global__ void KeSMatrixDenseMulCsr(real *C_d,
-                                     real *A_d,
-                                     real *csr_val,
-                                     const int *csr_row,
-                                     const int *csr_col,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int index_k = __mul24(blockIdx.x, CU_DM_CSR_THREAD_X) + threadIdx.x;
-  int index_m = __mul24(blockIdx.y, CU_DM_CSR_BLOCK_M) +
-    __mul24(threadIdx.y, CU_DM_CSR_N);
-
-  if (index_k >= dimK) {
-    return;
-  }
-
-  __shared__ real csr_val_sh[CU_DM_CSR_THREAD_X][CU_DM_CSR_SHARED_ELEMENT];
-  __shared__ int csr_col_sh[CU_DM_CSR_THREAD_X][CU_DM_CSR_SHARED_ELEMENT];
-
-  // possible optimization, cache this in shared memory
-  int csr_start = csr_row[index_k];
-  int csr_end = csr_row[index_k+1];
-  int csr_index =  csr_start + idy;
-  int csr_iter = (csr_end-csr_start)/CU_DM_CSR_SHARED_ELEMENT;
-  int csr_rem = (csr_end-csr_start)%CU_DM_CSR_SHARED_ELEMENT;
-
-  real tmp = 0.0;
-  int index_n = -1;
-  int index_m_t = index_m;
-  real a_r[CU_DM_CSR_N] = {0};
-  real *A_d_tmp = A_d + __mul24(index_m, dimK) + index_k;
-  real *A_d_r = A_d_tmp;
-
-  #pragma unroll
-  for (int n=0; n < CU_DM_CSR_N; n++) {
-    if ( index_m_t++ < dimM ) {
-      a_r[n] = A_d_r[0];
-      A_d_r += dimK;
-    }
-  }
-
-  for (int csr_i = 0; csr_i < csr_iter; csr_i++) {
-    #pragma unroll
-    for (int i = 0; i < (CU_DM_CSR_SHARED_ELEMENT/CU_DM_CSR_THREAD_Y); i++) {
-      if (VALUE_TYPE != 0) {
-        csr_val_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_val
-        [csr_index];
-      }
-      csr_col_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_col[csr_index];
-      csr_index += CU_DM_CSR_THREAD_Y;
-    }
-    __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < CU_DM_CSR_SHARED_ELEMENT; index++) {
-      index_n = csr_col_sh[idx][index];
-      real b_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idx][index];
-      real *C_d_r = C_d + __mul24(index_m, dimN) + index_n;
-
-      index_m_t = index_m;
-      #pragma unroll
-      for (int n=0; n < CU_DM_CSR_N; n++) {
-        if (index_m_t++ < dimM) {
-          tmp = alpha * b_r * a_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += dimN;
-        }
-      }
-    }
-    __syncthreads();
-  }
-
-  if (csr_rem != 0) {
-    #pragma unroll
-    for (int i = 0; i < (CU_DM_CSR_SHARED_ELEMENT/CU_DM_CSR_THREAD_Y); i++) {
-      if (csr_index < csr_end) {
-        if (VALUE_TYPE !=0) {
-          csr_val_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_val[csr_index];
-        }
-        csr_col_sh[idx][idy + i*CU_DM_CSR_THREAD_Y] = csr_col[csr_index];
-      }
-      csr_index += CU_DM_CSR_THREAD_Y;
-    }
-    __syncthreads();
-
-    #pragma unroll
-    for (int index = 0; index < csr_rem; index++) {
-      index_n = csr_col_sh[idx][index];
-      real b_r = VALUE_TYPE == 0 ? 1.0 : csr_val_sh[idx][index];
-      real *C_d_r = C_d + __mul24(index_m, dimN) + index_n;
-      index_m_t = index_m;
-      #pragma unroll
-      for (int n=0; n < CU_DM_CSR_N; n++) {
-        if (index_m_t++ < dimM) {
-          tmp = alpha * b_r * a_r[n];
-          paddle::paddleAtomicAdd(C_d_r, tmp);
-          C_d_r += dimN;
-        }
-      }
-    }
-  }
-}
-
-#define     CU_CSCMM_DMD2CSC_THREAD_X   128
-#define     CU_CSCMM_DMD2CSC_SHARE_X    128
-__global__ void KeSMatrixDenseMulDense2CSC(real *csc_val,
-                                           const int *csc_row,
-                                           const int *csc_col,
-                                           real *A_d,
-                                           real *B_d,
-                                           bool trans_A,
-                                           bool trans_B,
-                                           int dimM,
-                                           int dimN,
-                                           int dimK,
-                                           real alpha,
-                                           real beta) {
-  __shared__ real B_s[CU_CSCMM_DMD2CSC_SHARE_X];
-  const int idx = threadIdx.x;  // one block compute one column
-  const int ibx = blockIdx.x;  // col index
-  int csc_start;
-  int csc_end;
-  if (ibx < dimN) {
-    csc_start = csc_col[ibx];
-    csc_end = csc_col[ibx + 1];
-  } else {
-    csc_start = 0;
-    csc_end = 0;
-  }
-
-  int iter_num = dimK / CU_CSCMM_DMD2CSC_SHARE_X;
-  int iter_rem = dimK % CU_CSCMM_DMD2CSC_SHARE_X;
-  real * B_tmp = B_d + ibx;  // column index
-
-  for (int j = 0; j < iter_num; j++) {
-    int rowStart = (j * CU_CSCMM_DMD2CSC_SHARE_X + idx) * dimN;
-    int index = rowStart;
-    for (int m = idx;
-         m < CU_CSCMM_DMD2CSC_SHARE_X; m += CU_CSCMM_DMD2CSC_THREAD_X) {
-     B_s[m] = B_tmp[index];
-     index = index + CU_CSCMM_DMD2CSC_THREAD_X * dimN;
-    }
-    __syncthreads();
-
-    for (int i = csc_col[ibx] + idx;
-         i < csc_col[ibx + 1]; i += CU_CSCMM_DMD2CSC_THREAD_X) {
-      int row = csc_row[i];  // row Index
-      /* compute C[row, ibx] */
-      float results = 0;
-      if (!trans_A) {
-        int index = row * dimK + j * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < CU_CSCMM_DMD2CSC_SHARE_X; k++) {
-          results += A_d[index + k] * B_s[k];
-        }
-      } else {
-        int  index = j * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < CU_CSCMM_DMD2CSC_SHARE_X; k++) {
-          results += A_d[(index + k) * dimM + row] * B_s[k];
-        }
-      }
-      csc_val[i]  += results * alpha;
-    }
-  }
-
-  if (iter_rem) {
-    int rowStart = (iter_num * CU_CSCMM_DMD2CSC_SHARE_X + idx) * dimN;
-    int index = rowStart;
-    // #pragma unroll
-    for (int m = idx; m < iter_rem;  m += CU_CSCMM_DMD2CSC_THREAD_X) {
-      B_s[m] = B_tmp[index];
-      index = index + CU_CSCMM_DMD2CSC_THREAD_X * dimN;
-    }
-    __syncthreads();
-    for (int i = csc_start + idx;
-         i < csc_end; i += CU_CSCMM_DMD2CSC_THREAD_X) {
-      int row = csc_row[i];  // row Index
-      /* compute C[row, ibx] */
-      float results = 0;
-      if (!trans_A) {
-        int index = row * dimK + iter_num * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < iter_rem; k++) {
-          results += A_d[index + k] * B_s[k];
-        }
-      } else {
-        int  index =  iter_num * CU_CSCMM_DMD2CSC_SHARE_X;
-        for (int k = 0; k < iter_rem; k++) {
-          results += A_d[(index + k) * dimM + row] * B_s[k];
-        }
-      }
-      csc_val[i] += alpha * results;
-    }
-  }
-}
-
-#define     CU_CSCMM_DMD2CSR_THREAD_X   128
-#define     CU_CSCMM_DMD2CSR_SHARE_X    128
-__global__ void KeSMatrixDenseMulDense2CSR(real *csr_val,
-                                     const int *csr_row,
-                                     const int *csr_col,
-                                     real *A_d,
-                                     real *B_d,
-                                     bool  trans_A,
-                                     bool  trans_B,
-                                     int dimM,
-                                     int dimN,
-                                     int dimK,
-                                     real alpha,
-                                     real beta) {
-  __shared__ real A_s[CU_CSCMM_DMD2CSR_SHARE_X];
-  const int idx = threadIdx.x;  // one block comput one row
-  const int ibx = blockIdx.x;  // row index
-
-  int csr_start;
-  int csr_end;
-  if (ibx < dimM) {
-    csr_start = csr_row[ibx];
-    csr_end = csr_row[ibx+1];
-  } else {
-    csr_start = 0;
-    csr_end = 0;
-  }
-
-  int iter_num = dimK / CU_CSCMM_DMD2CSR_SHARE_X;
-  int csr_rem = dimK % CU_CSCMM_DMD2CSR_SHARE_X;
-  for (int j = 0; j < iter_num; j++) {
-    if (!trans_A) {
-      int colStart = j * CU_CSCMM_DMD2CSR_SHARE_X + ibx * dimK;
-      int index = colStart + idx;
-      #pragma unroll
-      for (int m = idx;
-           m < CU_CSCMM_DMD2CSR_SHARE_X; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X;
-      }
-    } else {
-      int colStart = (j * CU_CSCMM_DMD2CSR_SHARE_X) * dimM  + ibx;
-      int index = colStart + idx * dimM;
-      for (int m = idx;
-           m < CU_CSCMM_DMD2CSR_SHARE_X; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X * dimM;
-      }
-    }
-    __syncthreads();
-    for (int i = csr_start + idx; i < csr_end; i += CU_CSCMM_DMD2CSR_THREAD_X) {
-      int col_idx =  csr_col[i];  // col index
-      /* comput C[ibx, col_idx] */
-      real results = 0;
-      int index = (j * CU_CSCMM_DMD2CSR_SHARE_X) * dimN + col_idx;
-      for (int k = 0; k < CU_CSCMM_DMD2CSR_SHARE_X; k++) {
-        results += A_s[k] * B_d[k * dimN + index];
-      }
-      csr_val[i] += alpha * results;
-    }
-  }
-
-  if (csr_rem) {
-    if (!trans_A) {
-      int colStart = (ibx + 1) * dimK- csr_rem;
-      int index = colStart + idx;
-      #pragma unroll
-      for (int m = idx; m < csr_rem; m += CU_CSCMM_DMD2CSR_THREAD_X) {
-        A_s[m] = A_d[index];
-        index = index + CU_CSCMM_DMD2CSR_THREAD_X;
-      }
-     } else {
-        int colStart = (iter_num * CU_CSCMM_DMD2CSR_SHARE_X) * dimM  + ibx;
-        int index = colStart + idx * dimM;
-        for (int m = idx; m < csr_rem;  m += CU_CSCMM_DMD2CSR_THREAD_X) {
-          A_s[m] = A_d[index];
-          index = index + CU_CSCMM_DMD2CSR_THREAD_X * dimM;
-        }
-     }
-     __syncthreads();
-     for (int i = csr_start + idx;
-          i < csr_end; i += CU_CSCMM_DMD2CSR_THREAD_X) {
-       int col_idx =  csr_col[i];
-       float results = 0;
-       int  index = (iter_num *CU_CSCMM_DMD2CSR_SHARE_X) * dimN + col_idx;
-       for (int k = 0; k < csr_rem; k++) {
-         results += A_s[k ] * B_d[k * dimN + index];
-       }
-       csr_val[i] += alpha * results;
-     }
-  }
-}
-
-
-/**
- *  @brief  Use to calculate row/col index for CSR/CSC sparse matrix
- *          according to csr_row(csc_col) and
- *          the value position in csr_val/csc_val
- *
- *  @param  indice      csr_row for hl_csr_matrix
- *                      csc_col for hl_csc_matrix
- *  @param  num         length of csr_row/csc_col
- *  @param  index       the value position in csr_val/csc_val
- *                      but need to add 1
- *                      that is, 1,2,3,...,nnz
- *  @note   the following kernels doesn't use findIndex,
- *          but may be used in the future.
- */
-__device__ __forceinline__
-int findIndex(int* indice, int num, int index) {
-  int start = 0;
-  int end = num - 1;
-  int mid = -1;
-  while (start < end) {
-    mid = start + ((end - start) / 2);
-    if (indice[mid] < index)
-      start = mid + 1;
-    else
-      end = mid;
-  }
-  return (end - 1);
-}
-
-
-/**
- * @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
- *        This kernel used atomicAdd and adapted to w >> h, w is the
- *        width of csr, and h is the height of csr.
- */
-__global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
-                                      int* csr_col, const int dimNNZ) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
-    int colIdx = csr_col[idx];
-    real val = csr_val[idx];
-    paddle::paddleAtomicAdd(a_val + colIdx, val);
-  }
-}
-
-__global__ void KeSMatrixCsrAddBias(real* csr_val, int* csr_col, real* b_d,
-                                    real scale, const int nnz) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;  // global index
-  for (int idx = gid; idx < nnz; idx += gridDim.x * blockDim.x) {
-    int colIdx = csr_col[idx];
-    // not coalesced access to b_d
-    csr_val[idx] += scale * b_d[colIdx];
-  }
-}
-
-/**
- * @brief  csr sparse matrix add dense matrix.
- *         This kernel occurs load imbalances
- *         if number of each row is different greatly.
- */
-__global__ void KeSMatrixCsrAddDense(real* csr_val, int* csr_row,
-                                     int* csr_col, real* b_d, real alpha,
-                                     real beta, int dimM, int dimN) {
-  int gidx = blockIdx.x * blockDim.x + threadIdx.x;
-  int gidy = blockIdx.y;
-  if (gidy < dimM) {
-    int start = csr_row[gidy];
-    int end = csr_row[gidy + 1];
-    for (int x = gidx; x < (end - start); x += gridDim.x * blockDim.x) {
-      int col = csr_col[start + x];
-      real val = csr_val[start + x];
-      csr_val[start + x] = beta * val + alpha * b_d[gidy * dimN + col];
-    }
-  }
-}
-
-#define CU_BLOCK_K 16
-#define CU_BLOCK_SIZE 128
-
-__global__ void KeSMatrixDenseMulDenseTrans2CSR(
-    real* csr_val, const int* csr_row, const int* csr_col, real* A_d,
-    real* B_d, bool trans_A, bool trans_B, int dimM, int dimN, int dimK,
-    real alpha, real beta) {
-
-  __shared__ real B_s[CU_BLOCK_SIZE][CU_BLOCK_K];
-  __shared__ real A_s[CU_BLOCK_K];
-
-  const int idx = threadIdx.x;
-
-  const int gidx_begin = blockIdx.x * CU_BLOCK_SIZE;
-  const int gidy = blockIdx.y;
-  const int gx_dim = gridDim.x * blockDim.x;
-
-  int start = csr_row[gidy];
-  int end = csr_row[gidy + 1];
-  int size = end - start;
-
-  int c_iter_num = (size + gx_dim - 1) / gx_dim;
-  int iter_num = (dimK + CU_BLOCK_K - 1) / CU_BLOCK_K;
-  for (int i = 0; i < c_iter_num; ++i) {
-    if ((gidx_begin + i * gx_dim) >= size) {
-      return;  // No need to calculate in this block.
-    }
-
-    real res = 0.0;
-    int c_idx = gidx_begin + i * gx_dim + idx;
-
-    for (int j = 0; j < iter_num; ++j) {
-      int col = j * CU_BLOCK_K + idx;
-      if (idx < CU_BLOCK_K) {
-        A_s[idx] = col < dimK ? A_d[gidy * dimK + col] : 0.0;
-      }
-      for (int m = 0; m < CU_BLOCK_K; ++m) {
-        int row = (idx / CU_BLOCK_K) + m * (CU_BLOCK_SIZE / CU_BLOCK_K);
-        col = idx % CU_BLOCK_K;
-        int csr_idx = gidx_begin + i * gx_dim + row;
-        int ldRow = csr_idx < size ? csr_col[start + csr_idx] : 0;
-        int ldCol = j * CU_BLOCK_K + col;
-        B_s[row][col] = (csr_idx < size && ldCol < dimK) ?
-                        B_d[ldRow * dimK + ldCol] : 0.0;
-      }
-      __syncthreads();
-
-      for (int k = 0; k < CU_BLOCK_K; k++) {
-        res += A_s[k] * B_s[idx][k];
-      }
-      __syncthreads();
-    }
-
-    if (c_idx < size) {
-      csr_val[start + c_idx] += alpha * res;
-    }
-  }
-}
diff --git a/paddle/legacy/cuda/src/hl_math.cc b/paddle/legacy/cuda/src/hl_math.cc
deleted file mode 100644
index 585b356d0..000000000
--- a/paddle/legacy/cuda/src/hl_math.cc
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "avx_mathfun.h"
-
-namespace hppl {
-__m256 exp(__m256 a) { return exp256_ps(a); }
-
-__m256 log(__m256 a) { return log256_ps(a); }
-
-__m256 sin(__m256 a) { return sin256_ps(a); }
-
-__m256 cos(__m256 a) { return cos256_ps(a); }
-
-}  // namespace hppl
diff --git a/paddle/legacy/cuda/src/hl_perturbation_util.cu b/paddle/legacy/cuda/src/hl_perturbation_util.cu
deleted file mode 100644
index e15cbb143..000000000
--- a/paddle/legacy/cuda/src/hl_perturbation_util.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdlib.h>
-#include <cmath>
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_perturbation_util.cuh"
-#include "hl_time.h"
-
-#define _USE_MATH_DEFINES
-
-/*
- * Get the original coordinate for a pixel in a transformed image.
- * x, y: coordiate in the transformed image.
- * tgtCenter: the center coordiate of the transformed image.
- * imgSCenter: the center coordinate of the source image.
- * centerX, centerY: translation.
- * sourceX, sourceY: output coordinates in the original image.
- */
-__device__ void getTranformCoord(int x,
-                                 int y,
-                                 real theta,
-                                 real scale,
-                                 real tgtCenter,
-                                 real imgCenter,
-                                 real centerR,
-                                 real centerC,
-                                 int* sourceX,
-                                 int* sourceY) {
-  real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
-
-  // compute coornidates in the rotated and scaled image
-  real x_new = x - tgtCenter + centerC;
-  real y_new = y - tgtCenter + centerR;
-
-  // compute coornidates in the original image
-  x_new -= imgCenter;
-  y_new -= imgCenter;
-  real xx = H[0] * x_new + H[1] * y_new;
-  real yy = H[2] * x_new + H[3] * y_new;
-  *sourceX = __float2int_rn(xx / scale + imgCenter);
-  *sourceY = __float2int_rn(yy / scale + imgCenter);
-}
-
-/*
- * imgs:            (numImages, imgPixels)
- * target:          (numImages * samplingRate, tgtPixels)
- * the channels of one pixel are stored continuously in memory.
- *
- * created by Wei Xu (genome), converted by Jiang Wang
- */
-
-__global__ void kSamplingPatches(const real* imgs,
-                                 real* targets,
-                                 int imgSize,
-                                 int tgtSize,
-                                 const int channels,
-                                 int samplingRate,
-                                 const real* thetas,
-                                 const real* scales,
-                                 const int* centerRs,
-                                 const int* centerCs,
-                                 const real padValue,
-                                 const int numImages) {
-  const int caseIdx = blockIdx.x * 4 + threadIdx.x;
-  const int pxIdx = blockIdx.y * 128 + threadIdx.y;
-  const int imgPixels = imgSize * imgSize;
-  const int tgtPixels = tgtSize * tgtSize;
-  const int numPatches = numImages * samplingRate;
-
-  real tgtCenter = (tgtSize - 1) / 2;
-  real imgCenter = (imgSize - 1) / 2;
-
-  if (pxIdx < tgtPixels && caseIdx < numPatches) {
-    const int imgIdx = caseIdx / samplingRate;
-
-    // transform coordiates
-    const int pxX = pxIdx % tgtSize;
-    const int pxY = pxIdx / tgtSize;
-
-    int srcPxX, srcPxY;
-    getTranformCoord(pxX,
-                     pxY,
-                     thetas[imgIdx],
-                     scales[imgIdx],
-                     tgtCenter,
-                     imgCenter,
-                     centerCs[caseIdx],
-                     centerRs[caseIdx],
-                     &srcPxX,
-                     &srcPxY);
-
-    imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
-    targets += (caseIdx * tgtPixels + pxIdx) * channels;
-    if (srcPxX >= 0 && srcPxX < imgSize && srcPxY >= 0 && srcPxY < imgSize) {
-      for (int j = 0; j < channels; j++) targets[j] = imgs[j];
-    } else {
-      for (int j = 0; j < channels; j++) targets[j] = padValue;
-    }
-  }
-}
-
-/*
- * Functionality: generate the disturb (rotation and scaling) and
- *                sampling location sequence
- *
- * created by Wei Xu
- */
-void hl_generate_disturb_params(real*& gpuAngle,
-                                real*& gpuScaleRatio,
-                                int*& gpuCenterR,
-                                int*& gpuCenterC,
-                                int numImages,
-                                int imgSize,
-                                real rotateAngle,
-                                real scaleRatio,
-                                int samplingRate,
-                                bool isTrain) {
-  // The number of output samples.
-  int numPatches = numImages * samplingRate;
-
-  // create CPU perturbation parameters.
-  real* r_angle = new real[numImages];
-  real* s_ratio = new real[numImages];
-  int* center_r = new int[numPatches];
-  int* center_c = new int[numPatches];
-
-  // generate the random disturbance sequence and the sampling locations
-  if (isTrain) {  // random sampling for training
-    // generate rotation ans scaling parameters
-    // TODO(yuyang18): Since it will initialize random seed here, we can use
-    // rand_r instead of rand to make this method thread safe.
-    srand(getCurrentTimeStick());
-    for (int i = 0; i < numImages; i++) {
-      r_angle[i] =
-          (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                          -
-                                          0.5);
-      s_ratio[i] =
-          1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
-    }
-
-    int imgCenter = (imgSize - 1) / 2;
-
-    // generate sampling location parameters
-    for (int i = 0; i < numImages; i++) {
-      int j = 0;
-      srand((unsigned)time(NULL));
-      while (j < samplingRate) {
-        int pxX =
-            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-        int pxY =
-            (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
-
-        const real H[4] = {cos(-r_angle[i]),
-                           -sin(-r_angle[i]),
-                           sin(-r_angle[i]),
-                           cos(-r_angle[i])};
-        real x = pxX - imgCenter;
-        real y = pxY - imgCenter;
-        real xx = H[0] * x + H[1] * y;
-        real yy = H[2] * x + H[3] * y;
-
-        real srcPxX = xx / s_ratio[i] + imgCenter;
-        real srcPxY = yy / s_ratio[i] + imgCenter;
-
-        if (srcPxX >= 0 && srcPxX <= imgSize - 1 && srcPxY >= 0 &&
-            srcPxY <= imgSize - 1) {
-          center_r[i * samplingRate + j] = pxY;
-          center_c[i * samplingRate + j] = pxX;
-          j++;
-        }
-      }
-    }
-  } else {  // central crop for testing
-    for (int i = 0; i < numImages; i++) {
-      r_angle[i] = 0.0;
-      s_ratio[i] = 1.0;
-
-      for (int j = 0; j < samplingRate; j++) {
-        center_r[i * samplingRate + j] = (imgSize - 1) / 2;
-        center_c[i * samplingRate + j] = (imgSize - 1) / 2;
-      }
-    }
-  }
-
-  // copy disturbance sequence to gpu
-  hl_memcpy_host2device(gpuAngle, r_angle, sizeof(real) * numImages);
-  hl_memcpy_host2device(gpuScaleRatio, s_ratio, sizeof(real) * numImages);
-
-  delete[] r_angle;
-  delete[] s_ratio;
-
-  // copy sampling location sequence to gpu
-  hl_memcpy_host2device(gpuCenterR, center_r, sizeof(int) * numPatches);
-  hl_memcpy_host2device(gpuCenterC, center_c, sizeof(int) * numPatches);
-
-  delete[] center_r;
-  delete[] center_c;
-}
-
-void hl_conv_random_disturb_with_params(const real* images,
-                                        int imgSize,
-                                        int tgtSize,
-                                        int channels,
-                                        int numImages,
-                                        int samplingRate,
-                                        const real* gpuRotationAngle,
-                                        const real* gpuScaleRatio,
-                                        const int* gpuCenterR,
-                                        const int* gpuCenterC,
-                                        int paddingValue,
-                                        real* target) {
-  // The number of output samples.
-  int numPatches = numImages * samplingRate;
-  // The memory size of one output patch.
-  int targetSize = tgtSize * tgtSize;
-
-  dim3 threadsPerBlock(4, 128);
-  dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
-
-  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
-                                                   target,
-                                                   imgSize,
-                                                   tgtSize,
-                                                   channels,
-                                                   samplingRate,
-                                                   gpuRotationAngle,
-                                                   gpuScaleRatio,
-                                                   gpuCenterR,
-                                                   gpuCenterC,
-                                                   paddingValue,
-                                                   numImages);
-
-  hl_device_synchronize();
-}
-
-void hl_conv_random_disturb(const real* images,
-                            int imgSize,
-                            int tgtSize,
-                            int channels,
-                            int numImages,
-                            real scaleRatio,
-                            real rotateAngle,
-                            int samplingRate,
-                            real* gpu_r_angle,
-                            real* gpu_s_ratio,
-                            int* gpu_center_r,
-                            int* gpu_center_c,
-                            int paddingValue,
-                            bool isTrain,
-                            real* targets) {
-  // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle,
-                             gpu_s_ratio,
-                             gpu_center_r,
-                             gpu_center_c,
-                             numImages,
-                             imgSize,
-                             rotateAngle,
-                             scaleRatio,
-                             samplingRate,
-                             isTrain);
-
-  hl_conv_random_disturb_with_params(images,
-                                     imgSize,
-                                     tgtSize,
-                                     channels,
-                                     numImages,
-                                     samplingRate,
-                                     gpu_r_angle,
-                                     gpu_s_ratio,
-                                     gpu_center_r,
-                                     gpu_center_r,
-                                     paddingValue,
-                                     targets);
-}
diff --git a/paddle/legacy/cuda/src/hl_table_apply.cu b/paddle/legacy/cuda/src/hl_table_apply.cu
deleted file mode 100644
index 7411ae35d..000000000
--- a/paddle/legacy/cuda/src/hl_table_apply.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_device_functions.cuh"
-#include "paddle/legacy/utils/Logging.h"
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                int ldo,
-                                real* table,
-                                int ldt,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDimX;
-
-  while (idy < numSamples) {
-    int tableId = ids[idy];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* out = output + idy * ldo;
-      real* tab = table + tableId * ldt;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow) {
-          paddle::paddleAtomicAdd(&tab[i], out[i]);
-        } else {
-          out[i] += tab[i];
-        }
-      }
-    }
-    idy += blockDimY * gridDimX;
-  }
-}
-
-void hl_matrix_select_rows(real* output,
-                           int ldo,
-                           real* table,
-                           int ldt,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_select_rows failed");
-}
-
-void hl_matrix_add_to_rows(real* table,
-                           int ldt,
-                           real* input,
-                           int ldi,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_add_to_rows failed");
-}
-
-template <class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  int idx = threadIdx.x + blockDimX * blockIdx.x;
-  while (idx < sizei) {
-    int index = ids[idx];
-    // check(index < sizes);
-    dst[idx] = src[index];
-    idx += blockDimX * gridDimX;
-  }
-}
-
-template <class T>
-void hl_vector_select_from(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(ids);
-  CHECK_EQ(sized, sizei);
-
-  dim3 threads(512, 1);
-  dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      dst, sized, src, sizes, ids, sizei);
-
-  CHECK_SYNC("hl_vector_select_from failed");
-}
-
-template void hl_vector_select_from(real* dst,
-                                    int sized,
-                                    const real* src,
-                                    int sizes,
-                                    const int* ids,
-                                    int sizei);
-template void hl_vector_select_from(
-    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
diff --git a/paddle/legacy/cuda/src/hl_time.cc b/paddle/legacy/cuda/src/hl_time.cc
deleted file mode 100644
index 26af9ec80..000000000
--- a/paddle/legacy/cuda/src/hl_time.cc
+++ /dev/null
@@ -1,27 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_time.h"
-#include <stdlib.h>
-#include <chrono>
-#include <cstdint>
-#include <iostream>
-
-using std::chrono::high_resolution_clock;
-
-int64_t getCurrentTimeStick() {
-  high_resolution_clock::time_point tp = high_resolution_clock::now();
-  high_resolution_clock::duration dtn = tp.time_since_epoch();
-  return dtn.count();
-}
diff --git a/paddle/legacy/cuda/src/hl_top_k.cu b/paddle/legacy/cuda/src/hl_top_k.cu
deleted file mode 100644
index 041ac419f..000000000
--- a/paddle/legacy/cuda/src/hl_top_k.cu
+++ /dev/null
@@ -1,481 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/cuda/include/hl_base.h"
-#include "paddle/legacy/cuda/include/hl_sparse.ph"
-#include "paddle/legacy/cuda/include/hl_top_k.h"
-#include "paddle/legacy/utils/Logging.h"
-
-// using namespace hppl;
-
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-
-  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
-
-  __device__ __forceinline__ void set(real value, int id) {
-    v_ = value;
-    id_ = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair& in) {
-    v_ = in.v_;
-    id_ = in.id_;
-  }
-
-  __device__ __forceinline__ bool operator<(const real value) const {
-    return (v_ < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair& in) const {
-    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair& in) const {
-    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
-  }
-
-  real v_;
-  int id_;
-};
-
-__device__ __forceinline__ void addTo(Pair topK[],
-                                      const Pair& p,
-                                      int beamSize) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int beamSize>
-__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(Pair topK[],
-                                        real* val,
-                                        int* col,
-                                        int idx,
-                                        int dim,
-                                        const Pair& max,
-                                        int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* src,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* val,
-                                              int* col,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(
-            topK + maxLength - beam, val, col, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void blockReduce(Pair* shTopK,
-                                            int* maxId,
-                                            Pair topK[],
-                                            real** topVal,
-                                            int** topIds,
-                                            int& beam,
-                                            int& beamSize,
-                                            const int tid,
-                                            const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < blockSize / 2) {
-      if (shTopK[tid] < shTopK[tid + blockSize / 2]) {
-        maxId[tid] = tid + blockSize / 2;
-      } else {
-        maxId[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
-          maxId[tid] = maxId[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = shTopK[maxId[0]].v_;
-      **topIds = shTopK[maxId[0]].id_;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxId[0]) beam++;
-    if (--beamSize == 0) break;
-    __syncthreads();
-
-    // NOTE(zcd): temporary solution
-    unsigned mask = 0u;
-    CREATE_SHFL_MASK(mask, true);
-
-    if (tid == maxId[0]) {
-      if (beam < maxLength) {
-        shTopK[tid] = topK[beam];
-      }
-    }
-    if (maxId[0] / 32 == warp) {
-      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopK(real* topVal,
-                             int ldv,
-                             int* topIds,
-                             real* src,
-                             int lds,
-                             int dim,
-                             int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-template <int maxLength, int blockSize>
-__global__ void KeSMatrixTopK(real* topVal,
-                              int ldv,
-                              int* topIds,
-                              real* val,
-                              int* row,
-                              int* col,
-                              int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  int start = row[blockIdx.x];
-  int end = row[blockIdx.x + 1];
-  int dim = end - start;
-  val += start;
-  col += start;
-
-  if (beamSize > dim) {
-    // if the number of values to sort are less than the output size,
-    // use -1 to indicate the end of valid sorted values.
-    if (tid == 0) {
-      topIds[dim] = -1;
-    }
-
-    beamSize = dim;
-  }
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-void hl_matrix_top_k(real* topVal,
-                     int ldv,
-                     int* topIds,
-                     real* src,
-                     int lds,
-                     int dim,
-                     int beamSize,
-                     int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (beamSize > dim) beamSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, beamSize);
-
-  CHECK_SYNC("hl_matrix_top_k failed");
-}
-
-void hl_sparse_matrix_top_k(real* topVal,
-                            int ldv,
-                            int* topIds,
-                            hl_sparse_matrix_s src,
-                            int beamSize,
-                            int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
-
-  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
-    LOG(FATAL) << "parameter src is null!";
-  }
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
-
-  CHECK_SYNC("hl_sparse_matrix_top_k failed");
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopKClassificationError(real* topVal,
-                                                int ldv,
-                                                int* topIds,
-                                                real* src,
-                                                int lds,
-                                                int dim,
-                                                int beamSize,
-                                                int* label,
-                                                real* recResult) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-  int topkSize = beamSize;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-
-  __syncthreads();
-  if (tid == 0) {
-    for (int i = 0; i < topkSize; i++) {
-      if (*--topIds == label[blockIdx.x]) {
-        recResult[blockIdx.x] = 0;
-        break;
-      }
-      recResult[blockIdx.x] = 1.0f;
-    }
-  }
-}
-
-void hl_matrix_classification_error(real* topVal,
-                                    int ldv,
-                                    int* topIds,
-                                    real* src,
-                                    int lds,
-                                    int dim,
-                                    int topkSize,
-                                    int numSamples,
-                                    int* label,
-                                    real* recResult) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (topkSize > dim) topkSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
-
-  CHECK_SYNC("hl_matrix_top_k classification error failed");
-}
diff --git a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
deleted file mode 100644
index 31a8652f1..000000000
--- a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_warpctc_wrap.h"
-#include <mutex>
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag warpctc_dso_flag;
-void* warpctc_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load warpctc routine
- * via operator overloading. When PADDLE_USE_DSO is
- * false, you need to add the path of libwarp-ctc.so to
- * the linked-libs of paddle or to LD_PRELOAD.
- */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
-  struct DynLoad__##__name {                                           \
-    template <typename... Args>                                        \
-    auto operator()(Args... args) -> decltype(__name(args...)) {       \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
-      std::call_once(                                                  \
-          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
-    }                                                                  \
-  } __name;  // struct DynLoad__##__name
-
-// include all needed warp-ctc functions
-DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
-DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
-DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
-DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
-
-#undef DYNAMIC_LOAD_WARPCTC_WRAP
-
-} /* namespace dynload */
-
-#define WARPCTC_GET_VERSION dynload::get_warpctc_version
-#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
-
-static int g_warpctcVersion = -1;
-#ifndef PADDLE_TYPE_DOUBLE
-#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
-#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
-#else
-hl_warpctc_status_t fatal(...) {
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
-             << "] Error: not support double precision.";
-  // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
-  // type value
-  return CTC_STATUS_EXECUTION_FAILED;
-}
-#define WARPCTC_COMPUTE_LOSS fatal
-#define WARPCTC_GET_WORKSPACE_SIZE fatal
-#endif
-
-/**
- * Check build-in warp-ctc function using glog and it also
- * support << operator for more details error info.
- */
-#define CHECK_WARPCTC(warpctcStat)                \
-  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
-      << "warp-ctc [version " << g_warpctcVersion \
-      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
-
-void hl_warpctc_init(const size_t blank,
-                     bool useGpu,
-                     hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(options);
-
-  g_warpctcVersion = WARPCTC_GET_VERSION();
-
-  if (useGpu) {
-#ifdef __NVCC__
-    options->loc = CTC_GPU;
-    options->stream = STREAM_DEFAULT;
-#else
-    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
-#endif
-  } else {
-    options->loc = CTC_CPU;
-    options->num_threads = 1;
-  }
-
-  options->blank_label = blank;
-}
-
-void hl_warpctc_compute_loss(const real* batchInput,
-                             real* batchGrad,
-                             const int* cpuLabels,
-                             const int* cpuLabelLengths,
-                             const int* cpuInputLengths,
-                             const size_t numClasses,
-                             const size_t numSequences,
-                             real* cpuCosts,
-                             void* workspace,
-                             hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(batchInput);
-  CHECK_NOTNULL(cpuLabels);
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(cpuCosts);
-  CHECK_NOTNULL(workspace);
-  CHECK_NOTNULL(options);
-
-  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
-                                     batchGrad,
-                                     cpuLabels,
-                                     cpuLabelLengths,
-                                     cpuInputLengths,
-                                     numClasses,
-                                     numSequences,
-                                     cpuCosts,
-                                     workspace,
-                                     *options));
-}
-
-void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
-                                   const int* cpuInputLengths,
-                                   const size_t numClasses,
-                                   const size_t numSequences,
-                                   hl_warpctc_options_t* options,
-                                   size_t* bytes) {
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(options);
-  CHECK_NOTNULL(bytes);
-
-  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
-                                           cpuInputLengths,
-                                           numClasses,
-                                           numSequences,
-                                           *options,
-                                           bytes));
-}
diff --git a/paddle/legacy/function/BlockExpandOp.cpp b/paddle/legacy/function/BlockExpandOp.cpp
deleted file mode 100644
index f01f89a72..000000000
--- a/paddle/legacy/function/BlockExpandOp.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * \brief Converts the image data of four dimensions(NCHW) into
- *        a sequence data of three dimensions(NST) in the forward calculation,
- *        which is reversed in the backward calculation.
- *        Where N is batch size, S is the length of the sequence after each
- *        image is expanded, T is the size of each time step in the sequence.
- *
- * Arguments in forward function:
- * \param inputs[0]  Image data of NCHW format.
- * \param outputs[0] Sequence data of NST format.
- *
- * Arguments in backward function:
- * \param inputs[0]  Sequence data of NST format.
- * \param outputs[0] Image data of NCHW format.
- */
-class BlockExpandFunction : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    blocks_ = config.get<std::vector<size_t>>("blocks");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 1;
-  }
-
-  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
-    // image shape should be 4-dimensional.
-    CHECK_EQ(image.ndims(), (size_t)4);
-    // sequence shape should be 3-dimensional.
-    CHECK_EQ(sequence.ndims(), (size_t)3);
-    // The batchSize of the image needs to be equal to
-    // the batchSize of the sequence.
-    CHECK_EQ(image[0], sequence[0]);
-  }
-
-  // Calculate the shape of colData based on the shape of the image
-  // and the shape of the sequence.
-  TensorShape getColShape(const TensorShape& image,
-                          const TensorShape& sequence) const {
-    size_t inputChannels = image[1];
-    size_t inputHeight = image[2];
-    size_t inputWidth = image[3];
-    size_t seqLength = sequence[1];
-    size_t stepSize = sequence[2];
-    size_t outputHeight =
-        1 +
-        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
-    size_t outputWidth =
-        1 +
-        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
-    CHECK_EQ(seqLength, outputHeight * outputWidth);
-    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
-
-    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
-    return TensorShape({outputHeight,
-                        outputWidth,
-                        inputChannels,
-                        (size_t)blockH(),
-                        (size_t)blockW()});
-  }
-
- protected:
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> blocks_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int blockH() const { return blocks_[0]; }
-
-  inline int blockW() const { return blocks_[1]; }
-};
-
-template <DeviceType Device>
-class BlockExpandForward : public BlockExpandFunction {
- public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = inputs[0].data<real>();
-    real* seqData = outputs[0].data<real>();
-    Im2ColFunctor<kOCF, Device, real> im2col;
-    for (size_t i = 0; i < batchSize; i++) {
-      // The result of im2col is [outputHeight, outputWidth,
-      // inputChannels, filterHeight, filterWidth], and it is easy to
-      // reshape into [seqLength, stepSize], where seqLength is equal
-      // output_height * output_width, stepSize is equal
-      // input_channels * filter_height * filter_width
-      im2col(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-template <DeviceType Device>
-class BlockExpandBackward : public BlockExpandFunction {
- public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = outputs[0].data<real>();
-    real* seqData = inputs[0].data<real>();
-    Col2ImFunctor<kOCF, Device, real> col2im;
-    for (size_t i = 0; i < batchSize; i++) {
-      col2im(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BlockExpandOpTest.cpp b/paddle/legacy/function/BlockExpandOpTest.cpp
deleted file mode 100644
index 8fca4f6fd..000000000
--- a/paddle/legacy/function/BlockExpandOpTest.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(BlockExpandForward, real) {
-  for (size_t batchSize : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t inputHeight : {5, 33}) {
-        for (size_t inputWidth : {5, 32}) {
-          for (size_t block : {1, 3, 5}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // init Test object
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("BlockExpand",
-                                       FuncConfig()
-                                           .set("strides", strides)
-                                           .set("paddings", paddings)
-                                           .set("blocks", blocks));
-
-                size_t outputHeight =
-                    1 +
-                    (inputHeight + 2 * padding - block + stride - 1) / stride;
-                size_t outputWidth =
-                    1 +
-                    (inputWidth + 2 * padding - block + stride - 1) / stride;
-                TensorShape inputShape =
-                    TensorShape({batchSize, channels, inputHeight, inputWidth});
-                TensorShape outputShape =
-                    TensorShape({batchSize,
-                                 outputHeight * outputWidth,
-                                 channels * block * block});
-                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape));
-                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
-                // run Function
-                test.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(BlockExpandBackward, real) {
-  for (size_t batchSize : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t inputHeight : {5, 33}) {
-        for (size_t inputWidth : {5, 32}) {
-          for (size_t block : {1, 3, 5}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // init Test object
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> blocks = {block, block};
-                CpuGpuFuncCompare test("BlockExpandGrad",
-                                       FuncConfig()
-                                           .set("strides", strides)
-                                           .set("paddings", paddings)
-                                           .set("blocks", blocks));
-
-                size_t outputHeight =
-                    1 +
-                    (inputHeight + 2 * padding - block + stride - 1) / stride;
-                size_t outputWidth =
-                    1 +
-                    (inputWidth + 2 * padding - block + stride - 1) / stride;
-                TensorShape inputShape =
-                    TensorShape({batchSize, channels, inputHeight, inputWidth});
-                TensorShape outputShape =
-                    TensorShape({batchSize,
-                                 outputHeight * outputWidth,
-                                 channels * block * block});
-                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
-                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape),
-                                ADD_TO);
-                // run Function
-                test.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArg.cpp b/paddle/legacy/function/BufferArg.cpp
deleted file mode 100644
index 1f3d505c3..000000000
--- a/paddle/legacy/function/BufferArg.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-
-#include "BufferArg.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-const SequenceArg& BufferArg::sequence() const {
-  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
-  return dynamic_cast<const SequenceArg&>(*this);
-}
-
-const SparseMatrixArg& BufferArg::sparse() const {
-  CHECK_EQ(bufferType_, TENSOR_SPARSE);
-  return dynamic_cast<const SparseMatrixArg&>(*this);
-}
-
-SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArg.h b/paddle/legacy/function/BufferArg.h
deleted file mode 100644
index 1f47ad556..000000000
--- a/paddle/legacy/function/BufferArg.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-enum BufferType {
-  TENSOR_UNKNOWN = 0,
-  TENSOR_NORMAL = 1,
-  TENSOR_SEQUENCE_ID = 2,
-  TENSOR_SEQUENCE_DATA = 3,
-  TENSOR_SPARSE = 4
-};
-
-class BufferArg;
-class SequenceArg;
-class SparseMatrixArg;
-
-/**
- * \brief BufferArg used as the argument type of Function.
- *
- * The arguments of the Paddle Function have four Buffer types.
- * 1. BufferArg for a dense Buffer of any dimension.
- * 2. SequenceIdArg for a Buffer of sequence start positions.
- * 3. SequenceArg for a Buffer of sequence data.
- * 4. SparseMatrixArg for a Buffer of sparse matrix.
- *
- * Buffer shape
- * For most buffers, the first dimension `shape()[0]` represents
- * the size of the mini-batch.
- *
- * Buffer argType
- * There is an ArgType property for the BufferArg used as Function Output.
- * Whether the result of the Function calculation is assigned to the
- * output Buffer or added to the output Buffer is determined by the
- * argType_ property of the output BufferArg.
- */
-
-// ArgType is only used by output BufferArg.
-// For input argument, argType_ is ignored.
-// For output argument, need to set the argType_ of the BufferArg.
-enum ArgType {
-  UNSPECIFIED = 0,
-  ASSIGN_TO = 1,
-  ADD_TO = 2,
-};
-class BufferArg {
- public:
-  void setArgType(ArgType argType) { argType_ = argType; }
-
-  ArgType getArgType() const { return argType_; }
-
- public:
-  BufferArg(ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf,
-            ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(2),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, matrix.getHeight());
-    shape_.setDim(1, matrix.getWidth());
-  }
-
-  BufferArg(const Matrix& matrix,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(shape),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
-  }
-
-  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(VALUE_TYPE_INT32),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::Matrix matrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)2, shape_.ndims());
-    return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
-  }
-
-  template <typename VType, DeviceType DType>
-  typename Tensor<VType, DType>::Vector vector() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<VType>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)1, shape_.ndims());
-    return typename Tensor<VType, DType>::Vector(
-        shape_[0], reinterpret_cast<VType*>(buf_));
-  }
-
-  virtual ~BufferArg() {}
-
-  template <typename T>
-  T* data() const {
-    return reinterpret_cast<T*>(buf_);
-  }
-
-  void* data() const { return buf_; }
-  ValueType valueType() const { return valueType_; }
-  BufferType bufferType() const { return bufferType_; }
-  const TensorShape& shape() const { return shape_; }
-  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
-  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
-  virtual size_t numElements() const { return shape_.getElements(); }
-
-  const SequenceArg& sequence() const;
-  const SparseMatrixArg& sparse() const;
-
- protected:
-  void* buf_;
-  ValueType valueType_;
-  TensorShape shape_;
-  BufferType bufferType_{TENSOR_UNKNOWN};
-  ArgType argType_{UNSPECIFIED};
-  // TODO(tianbing), add deviceType_
-  // leading dimensions. The size is dims_.size()
-  // Dims lds_;
-};
-
-// sequence start positions in a mini-batch of sequences
-// shape_.ndims() == 1
-// valueType_ = int32
-// if a < b then value_.buf_[a] < value_.buf_[b]
-class SequenceIdArg : public BufferArg {
- public:
-  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
-      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    CHECK_GE(shape_[0], 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(void* buf,
-                const TensorShape& shape,
-                ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  ~SequenceIdArg() {}
-
-  size_t numSeqs() const { return numSeqs_; }
-
- private:
-  size_t numSeqs_;
-};
-
-// sequences data
-// For mini-batch calculate,
-// one batch can contain more than one sequence of data.
-// SequenceArg can be used to represent sequences that contain multiple
-// unequal lengths.
-class SequenceArg : public BufferArg {
- public:
-  SequenceArg(ValueType valueType,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        startPositions_(TensorShape({shape[0]})) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(void* buf,
-              ValueType valueType,
-              const TensorShape& shape,
-              const SequenceIdArg& startPositions,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  ~SequenceArg() {}
-
-  void* getIdBuf() const { return startPositions_.data(); }
-  size_t numSeqs() const { return startPositions_.numSeqs(); }
-  SequenceIdArg& getSequenceId() { return startPositions_; }
-  const SequenceIdArg& getSequenceId() const { return startPositions_; }
-
- private:
-  SequenceIdArg startPositions_;
-};
-
-// sparse matrix
-// valueType_ == float or double
-// shape_.ndims() == 2
-class SparseMatrixArg : public BufferArg {
- public:
-  SparseMatrixArg(void* buf,
-                  ValueType valueType,
-                  const TensorShape& shape,
-                  const BufferArg& row,
-                  const BufferArg& col,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        row_(row),
-        col_(col),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-    CHECK_EQ(row_.shape().ndims(), 1UL);
-    CHECK_EQ(col_.shape().ndims(), 1UL);
-    if (format_ == T_SPARSE_CSR) {
-      CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format_ == T_SPARSE_CSC) {
-      CHECK_EQ(nnz, row.shape()[0]);
-    }
-  }
-
-  SparseMatrixArg(ValueType valueType,
-                  const TensorShape& shape,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-
-    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
-    row_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
-    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
-    col_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
-  }
-
-  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ(2UL, shape_.ndims());
-    return typename Tensor<real, DType>::SparseMatrix(
-        reinterpret_cast<real*>(buf_),
-        reinterpret_cast<int*>(row_.data()),
-        reinterpret_cast<int*>(col_.data()),
-        shape_[0],
-        shape_[1],
-        nnz_,
-        static_cast<SparseValueType>(type_),
-        static_cast<SparseFormat>(format_),
-        false);
-  }
-
-  ~SparseMatrixArg() {}
-
-  void* getRowBuf() const { return row_.data(); }
-
-  void* getColBuf() const { return col_.data(); }
-
-  size_t nnz() const { return nnz_; }
-
-  size_t numElements() const override { return nnz_; }
-
-  SparseDataFormat dataFormat() const { return format_; }
-
-  SparseDataType dataType() const { return type_; }
-
- private:
-  BufferArg row_;
-  BufferArg col_;
-  size_t nnz_;
-  SparseDataFormat format_;
-  SparseDataType type_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArgTest.cpp b/paddle/legacy/function/BufferArgTest.cpp
deleted file mode 100644
index 1ec153bea..000000000
--- a/paddle/legacy/function/BufferArgTest.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BufferArg.h"
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/MemoryHandle.h"
-
-namespace paddle {
-
-TEST(BufferTest, BufferArg) {
-  TensorShape shape({8, 10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_FLOAT));
-  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-}
-
-TEST(BufferTest, SequenceIdArg) {
-  TensorShape shape({10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_INT32));
-  SequenceIdArg buffer(memory.getBuf(), shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-  EXPECT_EQ(buffer.numSeqs(), 9U);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CMakeLists.txt b/paddle/legacy/function/CMakeLists.txt
deleted file mode 100644
index 29b4ac098..000000000
--- a/paddle/legacy/function/CMakeLists.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-file(GLOB h_files . *Op.h)
-file(GLOB cpp_files . *Op.cpp)
-
-list(APPEND h_files Function.h)
-list(APPEND cpp_files Function.cpp)
-list(APPEND cpp_files BufferArg.cpp)
-list(APPEND cpp_files GemmFunctor.cpp)
-if(USE_EIGEN_FOR_BLAS)
-  list(APPEND cpp_files EigenGemm.cpp)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(WITH_GPU)
-    file(GLOB cu_files . *OpGpu.cu)
-    cuda_compile(cu_objs ${cu_files})
-endif()
-
-if(USE_NNPACK)
-  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
-  if(WITH_TESTING)
-    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
-  endif()
-endif()
-
-list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
-
-add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
-add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function paddle_proto)
-
-if(WITH_TESTING)
-if(WITH_GPU)
-    # TODO:
-    # file(GLOB test_files . *OpTest.cpp)
-    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(CrossMapNormalOpTest)
-    add_simple_unittest(TensorShapeTest)
-    add_simple_unittest(TensorTypeTest)
-    add_simple_unittest(BufferArgTest)
-    add_simple_unittest(FunctionTest)
-    add_simple_unittest(ContextProjectionOpTest)
-    add_simple_unittest(PadOpTest)
-    add_simple_unittest(MulOpTest)
-    add_simple_unittest(CosSimOpTest)
-    add_simple_unittest(RowConvOpTest)
-    add_simple_unittest(BlockExpandOpTest)
-    add_simple_unittest(CropOpTest)
-    add_simple_unittest(SwitchOpTest)
-    add_simple_unittest(ScaleSubRegionOpTest)
-endif()
-
-add_simple_unittest(Im2ColTest)
-add_simple_unittest(GemmConvOpTest)
-add_simple_unittest(DepthwiseConvOpTest)
-endif()
diff --git a/paddle/legacy/function/ContextProjectionOp.cpp b/paddle/legacy/function/ContextProjectionOp.cpp
deleted file mode 100644
index 05a3f9158..000000000
--- a/paddle/legacy/function/ContextProjectionOp.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjectionOp.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-/**
- * Context Projection Forward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                               const CpuMatrix& input_mat,
-                                               const CpuMatrix& weight_mat,
-                                               const CpuIVector& seq_vec,
-                                               size_t context_length,
-                                               int context_start,
-                                               size_t begin_pad) {
-  const int* starts = seq_vec.getData();
-  const size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat)
-                  .subMatrix(begin_pad + context_start + j - pad_size,
-                             pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src =
-          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
-      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
-      dst->addAtOffset(*src, j * input_mat.getWidth());
-    }
-  }
-}
-
-/**
- * Paddle Function for Context Projection Forward.
- * Calculate the output layer value sequence after context projection.
- *
- * What is Context Projection for a sequence?
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * \param outputs[0].matrix   output layer value, n * (d * l)
- * \param outputs[0].vector   start position sequence, n * 1
- * \param inputs[0].matrix    input layer value, n * d
- * \param inputs[0].vector    start position sequence, n * 1
- * \param inputs[1].matrix    input layer weight, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionForwardFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(1UL == inputs.size() || 2UL == inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
-    if (2UL == inputs.size()) {
-      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-      /// dim of input == dim of weight
-      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
-    }
-
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-    auto out_mat = out_seq.matrix<Device>();
-    const auto in_mat = val_seqs.matrix<Device>();
-    const auto w_mat =
-        (2UL == inputs.size() && inputs[1].data())
-            ? inputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
-
-    ContextProjectionForward<Device>(out_mat,
-                                     in_mat,
-                                     w_mat,
-                                     seq_vec,
-                                     context_length_,
-                                     context_start_,
-                                     begin_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-};
-
-/**
- * Context Projection Backward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
-                                                CpuMatrix& in_grad_mat,
-                                                CpuMatrix& w_grad_mat,
-                                                const CpuIVector& seq_vec,
-                                                size_t context_length,
-                                                int context_start,
-                                                size_t begin_pad,
-                                                bool is_padding,
-                                                size_t total_pad) {
-  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
-                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
-  const int* starts = seq_vec.getData();
-  size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(
-              begin_pad + context_start + j - pad_size, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
-                          .subMatrix(dst_begin, dst_end - dst_begin);
-      src->addAtOffset(*dst, j * input_dim);
-    }
-  }
-}
-
-/**
- * Context Projection Backward Function.
- * Update the weight gradient and input layer gradient with backprop
- *
- * \param inputs[0].matrix          output layer grad, n * (d * l)
- * \param inputs[0].vector          start position sequence, n * 1
- * \param outputs[0].matrix         input layer grad, n * d
- * \param outputs[0].vector         start position sequence, n * 1
- * \param outputs[1]                weight grad, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionBackwardFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    is_padding_ = config.get<bool>("is_padding");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK(1UL == outputs.size() || 2UL == outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
-
-    /// input and output grad has the same batch_size
-    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
-    /// dim of output grad = dim of input grad * context_length
-    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-
-    if (2UL == outputs.size()) {
-      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
-      /// dim of input grad == dim of weight
-      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
-      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    }
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto in_grad_mat =
-        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                        : out_seq.matrix<Device>();
-    auto w_grad_mat =
-        (2UL == outputs.size() && outputs[1].data())
-            ? outputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-
-    ContextProjectionBackward<Device>(out_grad_mat,
-                                      in_grad_mat,
-                                      w_grad_mat,
-                                      seq_vec,
-                                      context_length_,
-                                      context_start_,
-                                      begin_pad_,
-                                      is_padding_,
-                                      total_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  bool is_padding_;
-  size_t total_pad_;
-};
-
-/**
- * Context Projection Backward Data Function
- * Update input layer grad
- * input:  sequence of output layer grad
- * output: sequence of input layer grad
- *
- * \param outputs[0].matrix              input layer grad, n * d
- * \param outputs[0].vector              start position sequence, n * 1
- * \param inputs[0].matrix               output layer grad, n * (d * l)
- * \param inputs[0].vector               start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardDataFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    /// output layer grad dim == input layer grad dim * context_length_
-    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    auto in_grad_mat = out_seq.matrix<Device>();
-
-    ContextProjectionBackwardData<Device>(
-        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-};
-
-/**
- * Context Projection Backward Weight Function
- * Update weight grad by backprop
- * input:  sequence of output layer grad
- * output: weight grad
- *
- * \param outputs[0]                   weight grad, pad * d
- * \param inputs[0].matrix             output layer grad, n * (d * l)
- * \param inputs[0].vecotr             start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardWeightFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
-    /// output layer grad dim == weight dim * context_length_
-    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto w_grad_mat = outputs[0].matrix<Device>();
-    ContextProjectionBackwardWeight<Device>(out_grad_mat,
-                                            w_grad_mat,
-                                            seq_vec,
-                                            context_length_,
-                                            context_start_,
-                                            total_pad_,
-                                            begin_pad_);
-  }
-
- private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  size_t total_pad_;
-};
-
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    CPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    CPU,
-                    ContextProjectionBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    GPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    GPU,
-                    ContextProjectionBackwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
-                    GPU,
-                    ContextProjectionBackwardDataFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
-                    GPU,
-                    ContextProjectionBackwardWeightFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOp.h b/paddle/legacy/function/ContextProjectionOp.h
deleted file mode 100644
index 822734a78..000000000
--- a/paddle/legacy/function/ContextProjectionOp.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Context Projection Forward.
- *
- * \param[in/out]  outputs           output data.
- * \param[in]      input             input data.
- * \param[in]      weight            input weight.
- * \param[in]      sequence          input data.
- * \param[in]      context_length    consecutive rows for concatenation.
- * \param[in]      context_start     context start position.
- * \param[in]      begin_pad         begining pad position.
- * \param[in]      is_padding        whether padding 0 or not.
- *
- */
-template <DeviceType DType>
-void ContextProjectionForward(
-    typename Tensor<real, DType>::Matrix& output,
-    const typename Tensor<real, DType>::Matrix& input,
-    const typename Tensor<real, DType>::Matrix& weight,
-    const typename Tensor<int, DType>::Vector& sequence,
-    size_t context_length,
-    int context_start,
-    size_t begin_pad);
-
-/**
- * \brief   Context Projection Backward.
- *
- * \param[out]  outputs           output gradient.
- * \param[in]   input             input gradient.
- * \param[in]   weight            input weight gradient.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
- *
- */
-template <DeviceType DType>
-void ContextProjectionBackward(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& in_grad,
-    typename Tensor<real, DType>::Matrix& w_grad,
-    const typename Tensor<int, DType>::Vector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t begin_pad,
-    bool is_padding,
-    size_t total_pad);
-
-template <DeviceType DType>
-void ContextProjectionBackwardData(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& in_grad,
-    const typename Tensor<int, DType>::Vector& sequence,
-    size_t context_length,
-    int context_start);
-
-template <DeviceType DType>
-void ContextProjectionBackwardWeight(
-    const typename Tensor<real, DType>::Matrix& out_grad,
-    typename Tensor<real, DType>::Matrix& w_grad,
-    const typename Tensor<int, DType>::Vector& seq_vec,
-    size_t context_length,
-    int context_start,
-    size_t total_pad,
-    size_t begin_pad);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOpGpu.cu b/paddle/legacy/function/ContextProjectionOpGpu.cu
deleted file mode 100644
index 0a4d865e2..000000000
--- a/paddle/legacy/function/ContextProjectionOpGpu.cu
+++ /dev/null
@@ -1,413 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjectionOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-template <bool padding>
-__global__ void KeContextProjectionForward(const real* input,
-                                           const int* sequence,
-                                           const real* weight,
-                                           real* output,
-                                           int input_dim,
-                                           int context_length,
-                                           int context_start,
-                                           int begin_pad) {
-  int idx = threadIdx.x;
-  int block_size = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId + 1];
-  real value = 0;
-
-  int instances = seq_end - seq_start + context_length - 1;
-  output += seq_start * input_dim * context_length;
-  input += seq_start * input_dim;
-  for (int k = 0; k <= input_dim / block_size; k++) {
-    if (idx < input_dim) {
-      for (int i = 0; i < instances; i++) {
-        // i + context_start;
-        if ((i + context_start) < 0) {
-          if (padding) {
-            value = weight[i * input_dim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + context_start) >= (seq_end - seq_start)) {
-          if (padding) {
-            value =
-                weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
-                           input_dim +
-                       idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + context_start) * input_dim + idx];
-        }
-
-        int outx = (i - context_length) < 0 ? i : (context_length - 1);
-        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
-        real* output_r =
-            output + outy * input_dim * context_length + outx * input_dim;
-        for (int j = outy; j < seq_end - seq_start; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (context_length - 1) * input_dim;
-        }
-      }
-    }
-    idx += block_size;
-  }
-}
-
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weight          padding data.
- * @param[out]  output          output sequence.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   input_dim        input sequence dimension.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- * @param[in]   begin_pad        number of extra timesteps added at the
- * beginning.
- *
- */
-void hl_context_projection_forward(const real* input,
-                                   const int* sequence,
-                                   const real* weight,
-                                   real* output,
-                                   size_t num_sequences,
-                                   size_t input_dim,
-                                   size_t context_length,
-                                   int context_start,
-                                   size_t begin_pad) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-
-  int block_size = 128;
-  int blocks_x = num_sequences;
-  int blocks_y = 1;
-  dim3 threads(block_size, 1);
-  dim3 grid(blocks_x, blocks_y);
-
-  if (weight) {
-    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        input,
-        sequence,
-        weight,
-        output,
-        input_dim,
-        context_length,
-        context_start,
-        begin_pad);
-  } else {
-    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        input,
-        sequence,
-        weight,
-        output,
-        input_dim,
-        context_length,
-        context_start,
-        begin_pad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-template <>
-void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
-                                               const GpuMatrix& input,
-                                               const GpuMatrix& weight,
-                                               const GpuIVector& sequence,
-                                               size_t context_length,
-                                               int context_start,
-                                               size_t begin_pad) {
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                weight ? weight.getData() : nullptr,
-                                output.getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                context_length,
-                                context_start,
-                                begin_pad);
-}
-
-__global__ void KeContextProjectionBackwardData(const real* out_grad,
-                                                const int* sequence,
-                                                real* in_grad,
-                                                size_t input_dim,
-                                                int context_length,
-                                                int context_start) {
-  int idx = threadIdx.x;
-  int block_size = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId + 1];
-  real value = 0;
-
-  int instances = seq_end - seq_start + context_length - 1;
-  auto out = const_cast<real*>(out_grad);
-  out += seq_start * input_dim * context_length;
-  in_grad += seq_start * input_dim;
-  for (int k = 0; k <= input_dim / block_size; k++) {
-    if (idx < input_dim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + context_start) < 0) {
-          continue;
-        } else if ((i + context_start) >= (seq_end - seq_start)) {
-          continue;
-        } else {
-          // value = 0;
-          value = in_grad[(i + context_start) * input_dim + idx];
-        }
-
-        int outx = (i - context_length) < 0 ? i : (context_length - 1);
-        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
-        real* output_r =
-            out + outy * input_dim * context_length + outx * input_dim;
-        for (int j = outy; j < seq_end - seq_start; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (context_length - 1) * input_dim;
-        }
-        in_grad[(i + context_start) * input_dim + idx] = value;
-      }
-    }
-    idx += block_size;
-  }
-}
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   out_grad         output gradient.
- * @param[in]   sequence         sequence index.
- * @param[out]  input_grad       input gradient.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   input_dim        input sequence dimension.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- *
- */
-void hl_context_projection_backward_data(const real* out_grad,
-                                         const int* sequence,
-                                         real* input_grad,
-                                         size_t num_sequences,
-                                         size_t input_dim,
-                                         size_t context_length,
-                                         int context_start) {
-  CHECK_NOTNULL(out_grad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(input_grad);
-
-  int block_size = 128;
-  int blocks_x = num_sequences;
-  int blocks_y = 1;
-  dim3 threads(block_size, 1);
-  dim3 grid(blocks_x, blocks_y);
-  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      out_grad, sequence, input_grad, input_dim, context_length, context_start);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                    GpuMatrix& in_grad,
-                                                    const GpuIVector& sequence,
-                                                    size_t context_length,
-                                                    int context_start) {
-  hl_context_projection_backward_data(out_grad.getData(),
-                                      sequence.getData(),
-                                      in_grad.getData(),
-                                      sequence.getSize() - 1,
-                                      in_grad.getWidth(),
-                                      context_length,
-                                      context_start);
-}
-
-template <int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
-                                                  const int* sequence,
-                                                  real* w_grad,
-                                                  int num_sequences,
-                                                  int w_dim,
-                                                  int context_length,
-                                                  int context_start,
-                                                  int begin_pad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / pad_of_block;
-  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weight_idx < w_dim) {
-    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
-      int seq_start = sequence[seqId];
-      int seq_end = sequence[seqId + 1];
-      output_r =
-          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;
-
-      if (context_start < 0) {
-        if (padId + context_start < 0) {
-          instanceId = padId;
-        } else {
-          // begin_pad > 0;
-          instanceId =
-              (padId - begin_pad) + (seq_end - seq_start) - context_start;
-        }
-      } else {
-        if (padId + (seq_end - seq_start) < context_start) {
-          continue;
-        } else {
-          // begin_pad == 0;
-          instanceId = padId + (seq_end - seq_start) - context_start;
-        }
-      }
-
-      int outx =
-          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
-      int outy = (instanceId - context_length) < 0
-                     ? 0
-                     : (instanceId - (context_length - 1));
-      output_r += outy * w_dim * context_length + outx * w_dim;
-      for (int j = outy; j < seq_end - seq_start; j++) {
-        value += output_r[weight_idx];
-        if (j - outy == outx) break;
-        output_r += (context_length - 1) * w_dim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weight_idx < w_dim) {
-    if (idy == 0) {
-      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
-    }
-  }
-}
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   out_grad         output gradient.
- * @param[in]   sequence         sequence index.
- * @param[out]  w_grad           weight gradient.
- * @param[in]   num_sequences    number of sequences.
- * @param[in]   w_dim            input sequence dimension.
- * @param[in]   total_pad        number of extra timesteps.
- * @param[in]   context_length   context length.
- * @param[in]   context_start    context start.
- * @param[in]   begin_pad        number of extra timesteps added at the
- * beginning.
- *
- */
-void hl_context_projection_backward_weight(const real* out_grad,
-                                           const int* sequence,
-                                           real* w_grad,
-                                           size_t num_sequences,
-                                           size_t w_dim,
-                                           size_t total_pad,
-                                           size_t context_length,
-                                           int context_start,
-                                           size_t begin_pad) {
-  CHECK_NOTNULL(out_grad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(w_grad);
-
-  int threads_x = 32;
-  int threads_y = 32;
-  int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
-  dim3 threads(threads_x, threads_y);
-  dim3 grid(blocks_x, 1);
-
-  KeContextProjectionBackwardWeight<32,
-                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      out_grad,
-      sequence,
-      w_grad,
-      num_sequences,
-      w_dim,
-      context_length,
-      context_start,
-      begin_pad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
-template <>
-void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                      GpuMatrix& w_grad,
-                                                      const GpuIVector& seq_vec,
-                                                      size_t context_length,
-                                                      int context_start,
-                                                      size_t total_pad,
-                                                      size_t begin_pad) {
-  hl_context_projection_backward_weight(out_grad.getData(),
-                                        seq_vec.getData(),
-                                        w_grad.getData(),
-                                        seq_vec.getSize() - 1,
-                                        w_grad.getWidth(),
-                                        total_pad,
-                                        context_length,
-                                        context_start,
-                                        begin_pad);
-}
-
-template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                                GpuMatrix& in_grad,
-                                                GpuMatrix& w_grad,
-                                                const GpuIVector& sequence,
-                                                size_t context_length,
-                                                int context_start,
-                                                size_t begin_pad,
-                                                bool is_padding,
-                                                size_t total_pad) {
-  if (in_grad) {
-    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
-        out_grad, in_grad, sequence, context_length, context_start);
-  }
-  if (is_padding && w_grad) {
-    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
-                                                     w_grad,
-                                                     sequence,
-                                                     context_length,
-                                                     context_start,
-                                                     total_pad,
-                                                     begin_pad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ContextProjectionOpTest.cpp b/paddle/legacy/function/ContextProjectionOpTest.cpp
deleted file mode 100644
index 3b0a34567..000000000
--- a/paddle/legacy/function/ContextProjectionOpTest.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-void testMatrixProjectionForward(int context_start,
-                                 size_t context_length,
-                                 bool is_padding,
-                                 size_t batch_size,
-                                 size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionForward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start)));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
-  if (is_padding) {  // weight
-    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
-  }
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT,
-                  TensorShape{batch_size, input_dim * context_length}),
-      ADD_TO);
-
-  // run Function
-  test.run();
-}
-
-void testMatrixProjectionBackward(int context_start,
-                                  size_t context_length,
-                                  bool is_padding,
-                                  size_t batch_size,
-                                  size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionBackward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start))
-          .set("is_padding", is_padding)
-          .set("total_pad", pad));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(SequenceArg(
-      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
-      ADD_TO);
-  if (is_padding) {  // weight
-    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
-                    ADD_TO);
-  }
-
-  // run Function
-  test.run();
-}
-
-TEST(ContextProjection, Projection) {
-  for (auto context_start : {-5, -3, -1, 0, 3}) {
-    for (auto context_length : {1, 2, 5, 7}) {
-      for (auto trainable_padding : {false, true}) {
-        for (auto batch_size : {1, 2, 5, 20, 100}) {
-          for (auto input_dim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " context_start=" << context_start
-                    << " context_length=" << context_length
-                    << " trainable_padding=" << trainable_padding
-                    << " batch_size=" << batch_size
-                    << " input_dim=" << input_dim;
-            testMatrixProjectionForward(context_start,
-                                        context_length,
-                                        trainable_padding,
-                                        batch_size,
-                                        input_dim);
-            testMatrixProjectionBackward(context_start,
-                                         context_length,
-                                         trainable_padding,
-                                         batch_size,
-                                         input_dim);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/ConvOp.h b/paddle/legacy/function/ConvOp.h
deleted file mode 100644
index 2d8437bcf..000000000
--- a/paddle/legacy/function/ConvOp.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/*
- * \brief Based on the ConvFunctionBase class, the forward calculation,
- *        backward input calculation and backward filter calculation
- *        of convolution operations can be implemented.
- *
- * Arguments of forward and backward calculation:
- *   1. Forward calculation of convolution.
- *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
- *      The first and second input arguments are input image and filter data.
- *      The output argument is output image.
- *
- *   2. Backward input calculation of convolution.
- *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
- *      The first and second input arguments are output grad image
- *      and filter data.
- *      The output argument is input grad image.
- *
- *   3. Backward filter calculation of convolution.
- *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
- *      The first and second input arguments are output grad image
- *      and input image.
- *      The output argument is filter grad.
- *
- * Arguments format of input, filter and output:
- *   1. Input image, output image, input image gradient, output image gradient
- *      are all NCHW format. Where N is batch size, C is the number of channels,
- *      H and W is the height and width of image or image gradient.
- *
- *   2. The format of the filter data is MCHW, where M is the number of output
- *      image channels, C is the number of input image channels,
- *      H and W is height and width of filter.
- *
- *      If `groups` is greater than 1, the filter's data format should be GMCHW,
- *      where G is the `groups`, and G * M is the number of output image
- *      channels, G * C is the number of input image channels,
- *      H and W is height and width of filter.
- */
-class ConvFunctionBase : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    dilations_ = config.get<std::vector<size_t>>("dilations");
-    groups_ = config.get<size_t>("groups");
-
-    // number of inputs and outputs
-    numInputs_ = 2;
-    numOutputs_ = 1;
-  }
-
-  // input can be INPUT and INPUT_GRAD
-  // filter can be FILTER and FILTER_GRAD
-  // output can be OUTPUT and OUTPUT_GRAD
-  void checkShape(const TensorShape& input,
-                  const TensorShape& filter,
-                  const TensorShape& output) {
-    // inputs and outputs arguments should be 4-dimensional.
-    CHECK_EQ(input.ndims(), (size_t)4);
-    CHECK_EQ(output.ndims(), (size_t)4);
-    // The batchSize of the input needs to be equal to
-    // the batchSize of the output.
-    CHECK_EQ(input[0], output[0]);
-
-    if (filter.ndims() == (size_t)4) {
-      // If the filter's dimension is 4, groups convolution is not supported.
-      CHECK_EQ(groups_, (size_t)1);
-      // The input and output channel dimensions are the second and first
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[1]);
-      CHECK_EQ(output[1], filter[0]);
-    } else {
-      // filter argument should be 5-dimensional.
-      CHECK_EQ(filter.ndims(), (size_t)5);
-      // The first dimension of the filter is the size of the group
-      CHECK_EQ(filter[0], groups_);
-      // The input and output channel dimensions are the third and second
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[2] * groups_);
-      CHECK_EQ(output[1], filter[1] * groups_);
-    }
-  }
-
- protected:
-  size_t getFilterHeight(const TensorShape& filter) const {
-    return filter[filter.ndims() - 2];
-  }
-
-  size_t getFilterWidth(const TensorShape& filter) const {
-    return filter[filter.ndims() - 1];
-  }
-
-  // determine whether im2col needs to be performed
-  inline bool isNeedIm2col(const TensorShape& filter) const {
-    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
-             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
-             paddingW() == 0);
-  }
-
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> dilations_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  size_t groups_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int dilationH() const { return dilations_[0]; }
-
-  inline int dilationW() const { return dilations_[1]; }
-
-  // A temporary memory in convolution calculation.
-  MemoryHandlePtr memory_;
-
-  template <DeviceType Device>
-  void resizeBuffer(size_t newSize) {
-    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
-      if (Device == DEVICE_TYPE_CPU) {
-        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-      } else {
-        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ConvOpTest.h b/paddle/legacy/function/ConvOpTest.h
deleted file mode 100644
index 5eac60897..000000000
--- a/paddle/legacy/function/ConvOpTest.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FunctionTest.h"
-
-namespace paddle {
-
-template <DeviceType DType1, DeviceType DType2>
-void forward(Compare2Function<DType1, DType2>& test,
-             const TensorShape& input,
-             const TensorShape& filter,
-             const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-void backward_input(Compare2Function<DType1, DType2>& test,
-                    const TensorShape& input,
-                    const TensorShape& filter,
-                    const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-void backward_filter(Compare2Function<DType1, DType2>& test,
-                     const TensorShape& input,
-                     const TensorShape& filter,
-                     const TensorShape& output) {
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO);
-  test.run();
-}
-
-template <DeviceType DType1, DeviceType DType2>
-using Function = void (*)(Compare2Function<DType1, DType2>& test,
-                          const TensorShape& input,
-                          const TensorShape& filter,
-                          const TensorShape& output);
-
-/**
- * \brief A basic convolution function test interface.
- *
- * \param conv1         type name of convolution function 1.
- * \param conv2         type name of convolution function 2.
- * \param function      test function, can be one of the forward, backward_input
- *                      backward_filter function.
- * Example:
- * 1. Compare GemmConv's CPU and GPU implementation:
- *   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
- *      "GemmConv-CPU", "GemmConv-GPU", forward);
- */
-template <DeviceType DType1, DeviceType DType2>
-void Convolution(const std::string& conv1,
-                 const std::string& conv2,
-                 Function<DType1, DType2> function) {
-  for (size_t batchSize : {1, 5}) {
-    for (size_t inputSize : {7, 14, 31}) {
-      for (size_t filterSize : {1, 3, 5}) {
-        for (size_t inputChannels : {3, 16}) {
-          for (size_t outputChannels : {3, 16}) {
-            if (outputChannels < inputChannels) continue;
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1, 3}) {
-                  if (padding >= filterSize) break;
-                  size_t filterS = (filterSize - 1) * dilation + 1;
-
-                  if (inputSize + 2 * padding < filterS) break;
-
-                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
-                       conv1 == "NNPACKConv-CPU" ||
-                       conv2 == "NNPACKConv-CPU") &&
-                      dilation > 1)
-                    break;
-
-                  // NNPACK only supports stride = 1 if batchSize > 1
-                  if ((conv1 == "NNPACKConv-CPU" ||
-                       conv2 == "NNPACKConv-CPU") &&
-                      batchSize > 1 && stride > 1)
-                    break;
-
-                  size_t outputSize =
-                      (inputSize - filterS + 2 * padding + stride) / stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputSize
-                          << " inputWidth=" << inputSize
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterSize
-                          << " filterWidth=" << filterSize
-                          << " outputHeight=" << outputSize
-                          << " outputWidth=" << outputSize
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  std::vector<size_t> dilations = {dilation, dilation};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("dilations", dilations)
-                          .set("groups", (size_t)1)
-                          .set("algo", (std::string) "auto"));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterSize, filterSize};
-                  TensorShape output{
-                      batchSize, outputChannels, outputSize, outputSize};
-
-                  function(test, input, filter, output);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief A convolution function test interface for
- *        image height is not equal image width.
- */
-template <DeviceType DType1, DeviceType DType2>
-void Convolution2(const std::string& conv1,
-                  const std::string& conv2,
-                  Function<DType1, DType2> function) {
-  for (size_t batchSize : {4}) {
-    for (size_t inputHeight : {7, 31}) {
-      for (size_t inputWidth : {10, 54}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t inputChannels : {7}) {
-              for (size_t outputChannels : {7}) {
-                size_t stride = 1;
-                size_t padding = 0;
-                size_t dilation = 1;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputHeight
-                        << " inputWidth=" << inputWidth
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterHeight
-                        << " filterWidth=" << filterWidth
-                        << " outputHeight=" << outputHeight
-                        << " outputWidth=" << outputWidth
-                        << " stride=" << stride << " padding=" << padding;
-
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> dilations = {dilation, dilation};
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", (size_t)1)
-                        .set("dilations", dilations)
-                        .set("algo", (std::string) "auto"));
-
-                TensorShape input{
-                    batchSize, inputChannels, inputHeight, inputWidth};
-                TensorShape filter{
-                    outputChannels, inputChannels, filterHeight, filterWidth};
-                TensorShape output{
-                    batchSize, outputChannels, outputHeight, outputWidth};
-
-                function(test, input, filter, output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief A convolution function test interface for depthwise convolution.
- */
-template <DeviceType DType1, DeviceType DType2>
-void DepthwiseConvolution(const std::string& conv1,
-                          const std::string& conv2,
-                          Function<DType1, DType2> function) {
-  for (size_t batchSize : {1, 32}) {
-    for (size_t inputSize : {7, 14, 54}) {
-      for (size_t filterSize : {3, 4}) {
-        for (size_t inputChannels : {32}) {
-          for (size_t outputChannels : {32, 64}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                // NNPACK only supports stride = 1 if batchSize > 1,
-                // and there has some bug when batchSize > 1 and groups != 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
-                    batchSize > 1)
-                  break;
-
-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
-
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                std::vector<size_t> dilations = {1, 1};
-                size_t groups = inputChannels;
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", groups)
-                        .set("dilations", dilations)
-                        .set("algo", (std::string) "auto"));
-
-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{groups,
-                                   outputChannels / groups,
-                                   inputChannels / groups,
-                                   filterSize,
-                                   filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
-
-                function(test, input, filter, output);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOp.cpp b/paddle/legacy/function/CosSimOp.cpp
deleted file mode 100644
index d04f4396c..000000000
--- a/paddle/legacy/function/CosSimOp.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimOp.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-/**
- * Cosine Similarity for CpuMatrix
- *
- * \param out_mat, output value, size: nSamples * 1.
- * \param in1_mat, input value 1, size: nSamples * dim.
- * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale, default 1.0
- *
- */
-template <>
-void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                    const CpuMatrix& in1_mat,
-                                    const CpuMatrix& in2_mat,
-                                    real scale) {
-  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
-  size_t num_samples = out_mat.getHeight();
-  size_t dim = in1_mat.getWidth();
-  /// column vector [nSamples, 1]
-  real* out = out_mat.getData();
-  const real* x = in1_mat.getData();
-  const real* y = in2_mat.getData();
-
-  /// in2 might only have one row or full rows
-  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
-  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += x[j] * x[j];
-      square_sum_y += y[j] * y[j];
-      xy += x[j] * y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-  }
-}
-
-/**
- * Cosine Similarity
- * for each row i,
- *   out[i] = scale * cos(input1[i], input2[i])
- *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
- * when input2 only has one row, then for each row i,
- *   out[i] = cos(input1[i], input2[0])
- *
- * \param inputs[0] input matrix 1, size: nSamples * dim.
- * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param outputs[0] output matrix, size : nSamples * 1.
- */
-
-template <DeviceType Device>
-class CosSimForwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 2UL);
-    CHECK_EQ(outputs.size(), 1UL);
-
-    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], 1UL);
-
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    const auto in1_mat = inputs[0].matrix<Device>();
-    const auto in2_mat = inputs[1].matrix<Device>();
-
-    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
-  }
-
- private:
-  real scale_;
-};
-
-/**
- * Cosine Similarity Derivative for CpuMatrix
- *
- * \param in1_grad  forward input grad 1, size: nSamples * dim.
- * \param in2_grad  forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param out_grad  backward loss output grad, size : nSamples * 1.
- * \param out_val   forward output value, size: nSamples * 1.
- * \param in1_val   forward input value 1, size: nSamples * dim.
- * \param in2_val   forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale,    default 1.0
- */
-template <>
-void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
-                                     const CpuMatrix& out_val,
-                                     const CpuMatrix& in1_val,
-                                     const CpuMatrix& in2_val,
-                                     CpuMatrix& in1_grad,
-                                     CpuMatrix& in2_grad,
-                                     real scale) {
-  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
-        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
-
-  const real* grad = out_grad.getData();
-  const real* out = out_val.getData();
-  const real* prev_out_x = in1_val.getData();
-  const real* prev_out_y = in2_val.getData();
-  real* prev_grad_x = in1_grad.getData();
-  real* prev_grad_y = in2_grad.getData();
-
-  size_t num_samples = out_grad.getHeight();
-  size_t dim = in1_val.getWidth();
-  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
-  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
-  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i,
-              prev_out_x += dim,
-              prev_out_y += inc,
-              prev_grad_x += dim,
-              prev_grad_y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += prev_out_x[j] * prev_out_x[j];
-      square_sum_y += prev_out_y[j] * prev_out_y[j];
-      xy += prev_out_x[j] * prev_out_y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    if (xy == 0) {
-      real reciprocal =
-          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
-        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
-      }
-    } else {
-      real reciprocal_xy = 1.0f / xy;
-      real reciprocal_square_sum_x = 1.0f / square_sum_x;
-      real reciprocal_square_sum_y = 1.0f / square_sum_y;
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] +=
-            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
-                                prev_out_x[j] * reciprocal_square_sum_x);
-        prev_grad_y[j] +=
-            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
-                                prev_out_y[j] * reciprocal_square_sum_y);
-      }
-    }
-  }
-}
-
-/**
- * Cosine Similarity backward Derivative
- *
- * \param outputs[0] forward input grad 1, size: nSamples * dim.
- * \param outputs[1] forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param inputs[0] backward loss output grad, size : nSamples * 1.
- * \param inputs[1] forward output value, size: nSamples * 1.
- * \param inputs[2] forward input value 1, size: nSamples * dim.
- * \param inputs[3] forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- */
-template <DeviceType Device>
-class CosSimBackwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 4UL);
-    CHECK_EQ(outputs.size(), 2UL);
-    /// dim of out_grad and out_val == 1, column vector
-    CHECK_EQ(inputs[0].shape()[1], 1UL);
-    CHECK_EQ(inputs[1].shape()[1], 1UL);
-    /// nSamples of out_grad == out_val == in_val1 == in_grad1
-    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
-    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
-    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
-
-    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
-          inputs[3].data() && outputs[0].data() && outputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-
-    const auto out_grad = inputs[0].matrix<Device>();
-    const auto out_val = inputs[1].matrix<Device>();
-    const auto in1_val = inputs[2].matrix<Device>();
-    const auto in2_val = inputs[3].matrix<Device>();
-    auto in1_grad = outputs[0].matrix<Device>();
-    auto in2_grad = outputs[1].matrix<Device>();
-
-    CosSimBackward<Device>(
-        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
-  }
-
- private:
-  real scale_;
-};
-
-REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOp.h b/paddle/legacy/function/CosSimOp.h
deleted file mode 100644
index 2d377eb3b..000000000
--- a/paddle/legacy/function/CosSimOp.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Cosine Similarity Forward.
- * for each row i,
- * out[i] = scale * cos(in1[i], in2[i])
- *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
- *                  sqrt(sum_j (in1[i][j]^2) * sum_j (in2[i][j])^2)
- *
- * \param[out]  output            output value.
- * \param[in]   intput1           input value.
- * \param[in]   intput2           input value.
- * \param[in]   scale             default 1.0.
- *
- */
-template <DeviceType Device>
-void CosSimForward(typename Tensor<real, Device>::Matrix& output,
-                   const typename Tensor<real, Device>::Matrix& input1,
-                   const typename Tensor<real, Device>::Matrix& input2,
-                   real scale);
-
-/**
- * \brief   Cosine Similarity BackWard for Derivative.
- *
- * \param[in]       output grad           backward loss output grad.
- * \param[in]       output val            forward-output value.
- * \param[in]       input val1            forward input value 1.
- * \param[in]       input val2            forward input value 2.
- * \param[in/out]   input grad            forward input grad 1.
- * \param[in/out]   input grad            forward input grad 2.
- * \param[in]       scale                 default 1.0.
- *
- */
-template <DeviceType Device>
-void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
-                    const typename Tensor<real, Device>::Matrix& out_value,
-                    const typename Tensor<real, Device>::Matrix& in1_value,
-                    const typename Tensor<real, Device>::Matrix& in2_value,
-                    typename Tensor<real, Device>::Matrix& in1_grad,
-                    typename Tensor<real, Device>::Matrix& in2_grad,
-                    real scale);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOpGpu.cu b/paddle/legacy/function/CosSimOpGpu.cu
deleted file mode 100644
index 9fe50529a..000000000
--- a/paddle/legacy/function/CosSimOpGpu.cu
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimOp.h"
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-
-namespace paddle {
-
-template <int block_size>
-__global__ void KeCosSim(real* output,
-                         const real* input1,
-                         const real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[block_size];
-  __shared__ real yy[block_size];
-  __shared__ real xy[block_size];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += block_size) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = block_size / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hlCossim(real* output,
-              const real* input1,
-              const real* input2,
-              size_t width,
-              size_t input1_height,
-              size_t input2_height,
-              real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int block_size = 256;
-  dim3 threads(block_size, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hlCossim failed");
-}
-
-template <>
-void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
-                                    const GpuMatrix& in1_mat,
-                                    const GpuMatrix& in2_mat,
-                                    real scale) {
-  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
-  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
-      << "Matrix type are not GPU";
-
-  size_t dim = in1_mat.getWidth();
-  real* out = out_mat.getData();
-  const real* x = in1_mat.getData();
-  const real* y = in2_mat.getData();
-  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
-}
-
-template <int block_size>
-__global__ void KeCosSimDerivative(const real* grad,
-                                   const real* output,
-                                   const real* prev_out_x,
-                                   const real* prev_out_y,
-                                   real* prev_grad_x,
-                                   real* prev_grad_y,
-                                   size_t width,
-                                   size_t input1_height,
-                                   size_t input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[block_size];
-  __shared__ real yy[block_size];
-  __shared__ real xy[block_size];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prev_out_x += ty * width;
-  prev_grad_x += ty * width;
-  if (input2_height > 1) {
-    prev_out_y += ty * width;
-    prev_grad_y += ty * width;
-  }
-  for (int index = tid; index < width; index += block_size) {
-    real x = prev_out_x[index];
-    real y = prev_out_y[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = block_size / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
-      if (input2_height > 1) {
-        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(
-            prev_grad_y + index,
-            scale * grad[ty] * prev_out_x[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
-                                   prev_out_x[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prev_grad_y[index] +=
-            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
-                                     prev_out_y[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(
-            prev_grad_y + index,
-            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
-                                     prev_out_y[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-void hlCossimDerivative(const real* grad,
-                        const real* output,
-                        const real* prev_out_x,
-                        const real* prev_out_y,
-                        real* prev_grad_x,
-                        real* prev_grad_y,
-                        size_t width,
-                        size_t input1_height,
-                        size_t input2_height,
-                        real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prev_out_x);
-  CHECK_NOTNULL(prev_out_y);
-  CHECK_NOTNULL(prev_grad_x);
-  CHECK_NOTNULL(prev_grad_y);
-  const int block_size = 256;
-  dim3 threads(block_size, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad,
-      output,
-      prev_out_x,
-      prev_out_y,
-      prev_grad_x,
-      prev_grad_y,
-      width,
-      input1_height,
-      input2_height,
-      scale);
-  CHECK_SYNC("hlCossimDerivate failed");
-}
-
-template <>
-void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
-                                     const GpuMatrix& out_val,
-                                     const GpuMatrix& in1_val,
-                                     const GpuMatrix& in2_val,
-                                     GpuMatrix& in1_grad,
-                                     GpuMatrix& in2_grad,
-                                     real scale) {
-  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
-        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
-        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
-      << "Matrix types are not equally GPU";
-
-  size_t dim = in1_val.getWidth();
-  const real* grad = out_grad.getData();
-  const real* out = out_val.getData();
-  const real* prev_out_x = in1_val.getData();
-  const real* prev_out_y = in2_val.getData();
-  real* prev_grad_x = in1_grad.getData();
-  real* prev_grad_y = in2_grad.getData();
-  hlCossimDerivative(grad,
-                     out,
-                     prev_out_x,
-                     prev_out_y,
-                     prev_grad_x,
-                     prev_grad_y,
-                     dim,
-                     in1_val.getHeight(),
-                     in2_val.getHeight(),
-                     scale);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CosSimOpTest.cpp b/paddle/legacy/function/CosSimOpTest.cpp
deleted file mode 100644
index 31bb43e1b..000000000
--- a/paddle/legacy/function/CosSimOpTest.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-
-using namespace paddle;  // NOLINT
-
-void testCosSimForward(size_t height_x,
-                       size_t height_y,
-                       size_t width,
-                       real scale) {
-  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
-                  ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-void testCosSimBackward(size_t height_x,
-                        size_t height_y,
-                        size_t width,
-                        real scale) {
-  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
-                  ADD_TO);
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
-                  ADD_TO);
-  // run Function
-  test.run();
-}
-
-TEST(Matrix, cosSim) {
-  for (auto height_x : {10, 100, 1000}) {
-    for (auto height_y : {1, height_x}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimForward(height_x, height_y, width, scale);
-          testCosSimBackward(height_x, height_y, width, scale);
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/CropOp.cpp b/paddle/legacy/function/CropOp.cpp
deleted file mode 100644
index e22678822..000000000
--- a/paddle/legacy/function/CropOp.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropOp.h"
-#include "paddle/legacy/function/TensorShape.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Crop<DEVICE_TYPE_CPU>(real* outputs,
-                           const real* inputs,
-                           const TensorShape inShape,
-                           const TensorShape outShape,
-                           const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = inShape[0];
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < outC; c++) {
-      for (int h = 0; h < outH; h++) {
-        int outoff = ((n * outC + c) * outH + h) * outW;
-        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
-        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                               real* outGrad,
-                               const TensorShape inShape,
-                               const TensorShape outShape,
-                               const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = outShape[0];
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
-        int inoff = ((n * inC + c) * inH + h) * inW;
-        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
-        CpuVector outG = CpuVector(inW, outGrad + outoff);
-        outG += inG;
-      }
-    }
-  }
-}
-
-/**
- * \brief Crop input according to the specify corner and shape.
- *        The input and output is a 4D tensor. In CropFunc, we only
- *        crop the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the cropping corner and shape.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after cropping.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- *
- * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
- * Output(2,2,1,2) = [
- *                    [ [[4,5]],
- *                      [[6,7]] ],
- *                    [ [[8,7]],
- *                      [[3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- */
-template <DeviceType Device>
-class CropFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape inShape = inputs[0].shape();
-    TensorShape outShape = outputs[0].shape();
-
-    Crop<Device>(outputs[0].data<real>(),
-                 inputs[0].data<real>(),
-                 inShape,
-                 outShape,
-                 conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of cropping Function.
- *
- * Argument in this Function:
- * \param crop_    The same meaning as it in CropFunc.
- * \param inputs  The gradient with respect to the output value of CropFunc.
- * \param outputs The gradient with respect to the input value of CropFunc.
- */
-
-template <DeviceType Device>
-class CropGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape outShape = outputs[0].shape();
-    TensorShape inShape = inputs[0].shape();
-
-    CropGrad<Device>(inputs[0].data<real>(),
-                     outputs[0].data<real>(),
-                     inShape,
-                     outShape,
-                     conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOp.h b/paddle/legacy/function/CropOp.h
deleted file mode 100644
index 05d4b163b..000000000
--- a/paddle/legacy/function/CropOp.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief  This funtion crops inputs according to the specify start point and
- *shape.
- *
- * \param[out] outputs	save results.
- * \param[in]  inputs	input data.
- * \param[in]  inShape  the shape of input tensor.
- * \param[in]  conf     the cropping config
- */
-template <DeviceType Device>
-void Crop(real* outputs,
-          const real* inputs,
-          const TensorShape inShape,
-          const TensorShape outShape,
-          const FuncConfig& conf);
-
-/**
- * \brief   Cropping operation backward.
- *
- * \param[out] inGrad	gradients of previous layer
- * \param[in]  outGrad  output gradient
- * \param[in]  inShape  the shape of input tensor.
- * \param[in]  conf     the cropping config
- */
-template <DeviceType Device>
-void CropGrad(const real* inGrad,
-              real* outGrad,
-              const TensorShape inShape,
-              const TensorShape outShape,
-              const FuncConfig& conf);
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOpGpu.cu b/paddle/legacy/function/CropOpGpu.cu
deleted file mode 100644
index 561506243..000000000
--- a/paddle/legacy/function/CropOpGpu.cu
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeCrop(real* outputs,
-                       const real* inputs,
-                       int inC,
-                       int inH,
-                       int inW,
-                       int cropC,
-                       int cropH,
-                       int cropW,
-                       int outC,
-                       int outH,
-                       int outW,
-                       int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % outW;
-    const int h = (idx / outW) % outH;
-    const int c = (idx / outW / outH) % outC;
-    const int n = idx / outW / outH / outC;
-
-    const int off = ((n * inC + c + cropC) * inH + h + cropH) * inW + cropW + w;
-    outputs[idx] = inputs[off];
-  }
-}
-
-template <>
-void Crop<DEVICE_TYPE_GPU>(real* outputs,
-                           const real* inputs,
-                           const TensorShape inShape,
-                           const TensorShape outShape,
-                           const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cropC = crop_corner[1];
-  int cropH = crop_corner[2];
-  int cropW = crop_corner[3];
-
-  int num = inShape[0];
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  size_t nth = num * outC * outH * outW;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
-                                                     inputs,
-                                                     inC,
-                                                     inH,
-                                                     inW,
-                                                     cropC,
-                                                     cropH,
-                                                     cropW,
-                                                     outC,
-                                                     outH,
-                                                     outW,
-                                                     nth);
-  CHECK_SYNC("Crop");
-}
-
-__global__ void KeCropDiff(const real* inGrad,
-                           real* outGrad,
-                           int inC,
-                           int inH,
-                           int inW,
-                           int cropC,
-                           int cropH,
-                           int cropW,
-                           int outC,
-                           int outH,
-                           int outW,
-                           int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off =
-        ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
-
-    outGrad[off] += inGrad[idx];
-  }
-}
-
-template <>
-void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
-                               real* outGrad,
-                               const TensorShape inShape,
-                               const TensorShape outShape,
-                               const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cropC = crop_corner[1];
-  int cropH = crop_corner[2];
-  int cropW = crop_corner[3];
-
-  int num = outShape[0];
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
-                                                         outGrad,
-                                                         inC,
-                                                         inH,
-                                                         inW,
-                                                         cropC,
-                                                         cropH,
-                                                         cropW,
-                                                         outC,
-                                                         outH,
-                                                         outW,
-                                                         nth);
-  CHECK_SYNC("CropGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CropOpTest.cpp b/paddle/legacy/function/CropOpTest.cpp
deleted file mode 100644
index 10c83a032..000000000
--- a/paddle/legacy/function/CropOpTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Crop, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {5, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {false, true}) {
-            CpuGpuFuncCompare compare(
-                test_grad ? "CropGrad" : "Crop",
-                FuncConfig()
-                    .set<std::vector<uint32_t>>("crop_corner", {0, 1, 1, 1})
-                    .set<std::vector<uint32_t>>("crop_shape", {0, 2, 3, 3}));
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{numSamples, 2, 3, 3};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT,
-                                         test_grad ? inDims : outDims,
-                                         test_grad ? ADD_TO : ASSIGN_TO),
-                               test_grad ? ADD_TO : ASSIGN_TO);
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOp.cpp b/paddle/legacy/function/CrossMapNormalOp.cpp
deleted file mode 100644
index f28703af0..000000000
--- a/paddle/legacy/function/CrossMapNormalOp.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossMapNormalOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t oneImage = height * width;
-  size_t oneSample = channels * oneImage;
-
-  CpuVector outputsV(numSamples * oneSample, outputs);
-  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
-  CpuVector denomsV(numSamples * oneSample, denoms);
-
-  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
-  // x represents inputs
-  // f(x) represents outputs
-  // denoms save the intermediate result for backward
-  denomsV = denomsV.constant(1.0);
-  const int start = -((int)size - 1) / 2;
-  const int end = (int)size + start;
-  for (size_t i = 0; i < numSamples; i++) {
-    real* oneDenom = denoms + i * oneSample;
-    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
-    for (int c = 0; c < (int)channels; c++) {
-      CpuVector denom(oneImage, oneDenom + c * oneImage);
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
-          denom += input.square() * scale;
-        }
-      }
-    }
-  }
-
-  outputsV = inputsV * denomsV.pow(-pow);
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t oneSample = channels * height * width;
-  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
-                                                         size_t offset) {
-    return CpuVector(height * width, data + offset);
-  };
-
-  const int start = -((int)size) / 2;
-  const int end = (int)size + start;
-  const real ratio = -(real)2 * scale * pow;
-  for (size_t i = 0; i < numSamples; i++) {
-    size_t sOffset = i * oneSample;
-    real* oneInputGrad = inputsGrad + sOffset;
-    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
-    real* oneDenom = const_cast<real*>(denoms) + sOffset;
-    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
-    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
-
-    for (int c = 0; c < (int)channels; c++) {
-      size_t cOffset = c * height * width;
-      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
-      CpuVector inputValue = oneImage(oneInputValue, cOffset);
-      CpuVector denom = oneImage(oneDenom, cOffset);
-      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
-
-      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          size_t offset = (c + s) * height * width;
-          CpuVector output = oneImage(oneOutputValue, offset);
-          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
-          CpuVector denom = oneImage(oneDenom, offset);
-
-          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief Normalization with across maps.
- *
- * This Function comes from the paper
- * "ImageNet Classification with Deep Convolutional Neural Networks".
- *
- * The original formula is:
- *
- *                                Input(i, x, y)
- * Output(i, x, y) = ----------------------------------------------
- *                                 -- upper
- *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
- *                                 -- j = lower
- *
- * upper is `min(C, c + N/2)`
- * lower if `max(0, c - N/2)`
- *
- * Function implementation:
- *
- * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
- * And the meaning of each dimension(0-3) is respectively batch size,
- * feature maps, rows and columns.
- *
- * Input and Output in the above formula is for each map(i) of one image, and
- * Input(i, x, y), Output(i, x, y) represents an element in an image.
- *
- * C is the number of feature maps of one image, and N is a hyper-parameters
- * is configured when Function is initialized. The sum in the denominator
- * is the sum of the same position in the neighboring maps.
- *
- * In the implementation of Function, k is equal to 1,
- * so Function has no argument for k.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent Input
- * \param outputs[0] represent Output
- * \param outputs[1] represent The denominator in the formula(except beta)
- *
- * Note:
- * Save output[1] is to simplify the backward calculation.
- * TODO, if only consider the forward calculation, we can optimize to
- * remove the output[1].
- */
-template <DeviceType Device>
-class CrossMapNormalFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 2;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    // ArgType check still on here,
-    // not sure whether it is better to put inside the check.
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormal<Device>(outputs[0].data<real>(),
-                           outputs[1].data<real>(),
-                           inputs[0].data<real>(),
-                           batchSize,
-                           maps,
-                           rows,
-                           columns,
-                           size_,
-                           scale_,
-                           pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == outputs[0].shape());
-    CHECK(inputs[0].shape() == outputs[1].shape());
-  }
-
-  // Only need the shape of the input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)numInputs_, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
-
-    return ops;
-  }
-
- private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-/**
- * \brief Backward calculation for normalization with across maps.
- *
- * Function implementation:
- *
- * The implementation of this Function is derived from the
- * CrossMapNormalFunc implementation.
- *
- * InputGrad = OutputGrad * denoms ^ (-beta)
- *    -- upper
- *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
- *    -- lower
- *
- * The data of inputs/outputs format is the same as the forward interface
- * and is NCHW.
- *
- * The upper and lower is the same as forward. The logic of the sum
- * is also the same as forward.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
- * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
- * \param inputs[2]  represent OutputGrad
- * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
- *                   This is the intermediate result that is
- *                   preserved in the forward calculation.
- * \param outputs[0] represent InputGrad
- */
-template <DeviceType Device>
-class CrossMapNormalGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 4;
-    numOutputs_ = 1;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    if (outputs[0].getArgType() != ADD_TO) {
-      // Currently, some algorithm implementations are ASSIGN_TO mode,
-      // if need to support the ADD_TO calculation, need to clear the output.
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
-                               inputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               inputs[2].data<real>(),
-                               inputs[3].data<real>(),
-                               batchSize,
-                               maps,
-                               rows,
-                               columns,
-                               size_,
-                               scale_,
-                               pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == inputs[1].shape());
-    CHECK(inputs[0].shape() == inputs[2].shape());
-    CHECK(inputs[0].shape() == inputs[3].shape());
-    CHECK(inputs[0].shape() == outputs[0].shape());
-  }
-
-  // Only need the shape of one input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_LT((size_t)1, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
-
-    return ops;
-  }
-
- private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOp.h b/paddle/legacy/function/CrossMapNormalOp.h
deleted file mode 100644
index bb9cdf202..000000000
--- a/paddle/legacy/function/CrossMapNormalOp.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief   Cross map respose normalize forward.
- *          The data structure of image data is NCHW.
- *
- * \param[out]  outputs     output data.
- * \param[in]   denoms      denoms buffer.
- * \param[in]   inputs      input data.
- * \param[in]   numSamples  batch size of input image.
- * \param[in]   channels    number of channel.
- * \param[in]   height      image height.
- * \param[in]   width       image width.
- * \param[in]   size        size.
- * \param[in]   scale       scale.
- * \param[in]   pow         scale.
- *
- */
-template <DeviceType Device>
-void CrossMapNormal(real* outputs,
-                    real* denoms,
-                    const real* inputs,
-                    size_t numSamples,
-                    size_t channels,
-                    size_t height,
-                    size_t width,
-                    size_t size,
-                    real scale,
-                    real pow);
-
-/**
- * \brief   Cross map respose normalize backward.
- *          The data structure of image data is NCHW.
- *
- * \param[out]  inputsGrad      input grad.
- * \param[in]   inputsValue     input value.
- * \param[out]  outputsValue    output value.
- * \param[out]  outputsGrad     output grad.
- * \param[in]   denoms          denoms buffer.
- * \param[in]   numSamples      batch size of input image.
- * \param[in]   channels        number of channel.
- * \param[in]   height          image height.
- * \param[in]   width           image width.
- * \param[in]   size            size.
- * \param[in]   scale           scale.
- * \param[in]   pow             scale.
- *
- */
-template <DeviceType Device>
-void CrossMapNormalGrad(real* inputsGrad,
-                        const real* inputsValue,
-                        const real* outputsValue,
-                        const real* outputsGrad,
-                        const real* denoms,
-                        size_t numSamples,
-                        size_t channels,
-                        size_t height,
-                        size_t width,
-                        size_t size,
-                        real scale,
-                        real pow);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOpGpu.cu b/paddle/legacy/function/CrossMapNormalOpGpu.cu
deleted file mode 100644
index 938827610..000000000
--- a/paddle/legacy/function/CrossMapNormalOpGpu.cu
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossMapNormalOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeCMRNormFillScale(size_t imageSize,
-                                   const real* in,
-                                   real* scale,
-                                   size_t channels,
-                                   size_t height,
-                                   size_t width,
-                                   size_t size,
-                                   real alpha) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-
-    in += offset;
-    scale += offset;
-    const int step = height * width;
-    const int pre_pad = (size - 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    real accum = 0;
-    int index = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += in[index * step] * in[index * step];
-      }
-      if (index >= size) {
-        accum -= in[(index - size) * step] * in[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        scale[(index - post_pad) * step] = 1. + accum * alpha;
-      }
-      ++index;
-    }
-  }
-}
-
-__global__ void KeCMRNormOutput(size_t inputSize,
-                                const real* in,
-                                const real* scale,
-                                real negative_beta,
-                                real* out) {
-  const int index = threadIdx.x + blockIdx.x * blockDim.x;
-  if (index < inputSize) {
-    out[index] = in[index] * pow(scale[index], negative_beta);
-  }
-}
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t imageSize = numSamples * height * width;
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      imageSize, inputs, denoms, channels, height, width, size, scale);
-
-  size_t inputSize = numSamples * height * width * channels;
-  blockSize = 1024;
-  gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      inputSize, inputs, denoms, -pow, outputs);
-
-  CHECK_SYNC("CrossMapNormal");
-}
-
-__global__ void KeCMRNormDiff(size_t imageSize,
-                              const real* bottom_data,
-                              const real* top_data,
-                              const real* scale,
-                              const real* top_diff,
-                              size_t channels,
-                              size_t height,
-                              size_t width,
-                              size_t size,
-                              real negative_beta,
-                              real cache_ratio,
-                              real* bottom_diff) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < imageSize) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int n = idx / width / height;
-    const int offset = (n * channels * height + h) * width + w;
-    bottom_data += offset;
-    top_data += offset;
-    scale += offset;
-    top_diff += offset;
-    bottom_diff += offset;
-
-    const int step = height * width;
-    const int pre_pad = size - (size + 1) / 2;
-    const int post_pad = size - pre_pad - 1;
-
-    int index = 0;
-    real accum = 0;
-    while (index < channels + post_pad) {
-      if (index < channels) {
-        accum += top_diff[index * step] * top_data[index * step] /
-                 scale[index * step];
-      }
-      if (index >= size) {
-        accum -= top_diff[(index - size) * step] *
-                 top_data[(index - size) * step] / scale[(index - size) * step];
-      }
-      if (index >= post_pad) {
-        bottom_diff[(index - post_pad) * step] +=
-            top_diff[(index - post_pad) * step] *
-                pow(scale[(index - post_pad) * step], negative_beta) -
-            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
-      }
-      ++index;
-    }
-  }
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t imageSize = numSamples * height * width;
-
-  int blockSize = 1024;
-  int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
-                                                            inputsValue,
-                                                            outputsValue,
-                                                            denoms,
-                                                            outputsGrad,
-                                                            channels,
-                                                            height,
-                                                            width,
-                                                            size,
-                                                            -pow,
-                                                            2.0f * pow * scale,
-                                                            inputsGrad);
-  CHECK_SYNC("CrossMapNormalGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/CrossMapNormalOpTest.cpp b/paddle/legacy/function/CrossMapNormalOpTest.cpp
deleted file mode 100644
index dec52adde..000000000
--- a/paddle/legacy/function/CrossMapNormalOpTest.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(CrossMapNormal, real) {
-  for (size_t numSamples : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (size_t size : {1, 3}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " size=" << size;
-
-            // init Test object
-            CpuGpuFuncCompare test("CrossMapNormal",
-                                   FuncConfig()
-                                       .set("size", size)
-                                       .set("scale", (real)1.5)
-                                       .set("pow", (real)0.5));
-            // prepare input arguments
-            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            // run Function
-            test.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(CrossMapNormalGrad, real) {
-  for (size_t numSamples : {5}) {
-    for (size_t channels : {1, 5}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (size_t size : {1, 3}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " size=" << size;
-
-            CpuGpuFuncCompare test("CrossMapNormalGrad",
-                                   FuncConfig()
-                                       .set("size", size)
-                                       .set("scale", (real)1.5)
-                                       .set("pow", (real)0.5));
-            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-            // run Function
-            test.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOp.cpp b/paddle/legacy/function/DepthwiseConvOp.cpp
deleted file mode 100644
index 958034e08..000000000
--- a/paddle/legacy/function/DepthwiseConvOp.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "ConvOp.h"
-
-namespace paddle {
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    // TODO(zhaolong) : cpu implementation of depthwise convolution
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-/*
- * \brief Forward calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-
-    DepthwiseConvFunctor<Device, real> depthwiseConv;
-    depthwiseConv(inputData,
-                  filterData,
-                  batchSize,
-                  outputChannels,
-                  outputHeight,
-                  outputWidth,
-                  inputChannels,
-                  inputHeight,
-                  inputWidth,
-                  filterMultiplier,
-                  filterHeight,
-                  filterWidth,
-                  strideH(),
-                  strideW(),
-                  paddingH(),
-                  paddingW(),
-                  outputData);
-  }
-};
-
-/*
- * \brief Backward input calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradInputFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-
-    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
-    depthwiseConvGradInput(outputGrad,
-                           filterData,
-                           batchSize,
-                           outputChannels,
-                           outputHeight,
-                           outputWidth,
-                           inputChannels,
-                           inputHeight,
-                           inputWidth,
-                           filterMultiplier,
-                           filterHeight,
-                           filterWidth,
-                           strideH(),
-                           strideW(),
-                           paddingH(),
-                           paddingW(),
-                           inputGrad);
-  }
-};
-
-/*
- * \brief Backward filter calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-
-    int size = outputChannels * filterHeight * filterWidth * outputHeight *
-               outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
-
-    depthwiseConvGradFilter(outputGrad,
-                            inputData,
-                            batchSize,
-                            outputChannels,
-                            outputHeight,
-                            outputWidth,
-                            inputChannels,
-                            inputHeight,
-                            inputWidth,
-                            filterMultiplier,
-                            filterHeight,
-                            filterWidth,
-                            strideH(),
-                            strideW(),
-                            paddingH(),
-                            paddingW(),
-                            colData,
-                            filterGrad);
-  }
-};
-
-REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    CPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    CPU,
-                    DepthwiseConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    GPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    GPU,
-                    DepthwiseConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOp.h b/paddle/legacy/function/DepthwiseConvOp.h
deleted file mode 100644
index 7837edd1c..000000000
--- a/paddle/legacy/function/DepthwiseConvOp.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorType.h"
-
-namespace paddle {
-
-/**
- *\brief   Depthwise convolution forward. The outputData
- *         of depthwise convolution is same with ExpandConvLayer
- *         when groups equals inputChannels in ExpandConvLayer.
- *
- * \param[in]   inputData         input data.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of inputData.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData..
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  outputData        outputData.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvFunctor {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
- *
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  inputGrad         the grad data of input.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradInputFunctor {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   inputData         inputData.
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[in]   colData           Auxiliary data when calculating filterGrad.
- * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
- * \param[out]  filterGrad        the grad data of filter.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradFilterFunctor {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOpGpu.cu b/paddle/legacy/function/DepthwiseConvOpGpu.cu
deleted file mode 100644
index 17138cc56..000000000
--- a/paddle/legacy/function/DepthwiseConvOpGpu.cu
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-
-namespace paddle {
-
-// CUDA kernel to compute the depthwise convolution forward pass
-template <class T>
-__global__ void ConvolutionDepthwiseForward(const int nthreads,
-                                            const T* const inputData,
-                                            const T* const filterData,
-                                            const int batchSize,
-                                            const int outputChannels,
-                                            const int outputHeight,
-                                            const int outputWidth,
-                                            const int inputChannels,
-                                            const int inputHeight,
-                                            const int inputWidth,
-                                            const int filterMultiplier,
-                                            const int filterHeight,
-                                            const int filterWidth,
-                                            const int strideH,
-                                            const int strideW,
-                                            const int paddingH,
-                                            const int paddingW,
-                                            T* const outputData) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-
-  if (index < nthreads) {
-    const int batch = index / outputChannels / outputHeight / outputWidth;
-    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-
-    const int c_in = c_out / filterMultiplier;
-    const T* weight = filterData + c_out * filterHeight * filterWidth;
-    T value = 0;
-    const int h_in_start = -paddingH + h_out * strideH;
-    const int w_in_start = -paddingW + w_out * strideW;
-    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
-    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
-    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
-        (w_in_end < inputWidth)) {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          const int offset =
-              ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                  inputWidth +
-              w_in;
-          value += (*weight) * inputData[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-              (w_in < inputWidth)) {
-            const int offset =
-                ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                    inputWidth +
-                w_in;
-            value += (*weight) * inputData[offset];
-          }
-          ++weight;
-        }
-      }
-    }
-    outputData[index] = value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
-template <class T>
-__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
-                                                  const T* const top_diff,
-                                                  const T* const weight_data,
-                                                  const int num,
-                                                  const int outputChannels,
-                                                  const int outputHeight,
-                                                  const int outputWidth,
-                                                  const int inputChannels,
-                                                  const int inputHeight,
-                                                  const int inputWidth,
-                                                  const int filterMultiplier,
-                                                  const int filterHeight,
-                                                  const int filterWidth,
-                                                  const int strideH,
-                                                  const int strideW,
-                                                  const int paddingH,
-                                                  const int paddingW,
-                                                  T* const bottom_diff) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / inputChannels / inputHeight / inputWidth;
-    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
-    const int h_in = (index / inputWidth) % inputHeight;
-    const int w_in = index % inputWidth;
-
-    const int c_out_start = c_in * filterMultiplier;
-
-    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
-    h_out_start = 0 > h_out_start ? 0 : h_out_start;
-    int h_out_end = (h_in + paddingH) / strideH;
-    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
-    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
-    w_out_start = 0 > w_out_start ? 0 : w_out_start;
-    int w_out_end = (w_in + paddingW) / strideW;
-    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
-
-    T value = 0;
-
-    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
-         c_out++) {
-      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
-        const int filter_h = h_in + paddingH - h_out * strideH;
-        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
-          const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth +
-                                    filter_h * filterWidth + filter_w;
-          const int top_diff_offset =
-              ((batch * outputChannels + c_out) * outputHeight + h_out) *
-                  outputWidth +
-              w_out;
-          value += top_diff[top_diff_offset] * weight_data[filter_offset];
-        }
-      }
-    }
-    bottom_diff[index] += value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
-template <class T>
-__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
-                                                   const int nthreads,
-                                                   const T* const top_diff,
-                                                   const T* const inputData,
-                                                   const int num,
-                                                   const int outputChannels,
-                                                   const int outputHeight,
-                                                   const int outputWidth,
-                                                   const int inputChannels,
-                                                   const int inputHeight,
-                                                   const int inputWidth,
-                                                   const int filterMultiplier,
-                                                   const int filterHeight,
-                                                   const int filterWidth,
-                                                   const int strideH,
-                                                   const int strideW,
-                                                   const int paddingH,
-                                                   const int paddingW,
-                                                   T* const buffer_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-    const int kh =
-        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
-    const int kw = (index / outputHeight / outputWidth) % filterWidth;
-    const int h_in = -paddingH + h_out * strideH + kh;
-    const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-        (w_in < inputWidth)) {
-      const int c_out =
-          index / (filterHeight * filterWidth * outputHeight * outputWidth);
-      const int c_in = c_out / filterMultiplier;
-      const int batch = num_i;
-      const int top_offset =
-          ((batch * outputChannels + c_out) * outputHeight + h_out) *
-              outputWidth +
-          w_out;
-      const int bottom_offset =
-          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
-          w_in;
-      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
-    } else {
-      buffer_data[index] = 0;
-    }
-  }
-}
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
-
-    size_t blocks = (outputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        outputSize,
-        inputData,
-        filterData,
-        batchSize,
-        outputChannels,
-        outputHeight,
-        outputWidth,
-        inputChannels,
-        inputHeight,
-        inputWidth,
-        filterMultiplier,
-        filterHeight,
-        filterWidth,
-        strideH,
-        strideW,
-        paddingH,
-        paddingW,
-        outputData);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {
-    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-
-    size_t blocks = (inputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseInputBackward<T>
-        // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
-                                               outputGrad,
-                                               filterData,
-                                               batchSize,
-                                               outputChannels,
-                                               outputHeight,
-                                               outputWidth,
-                                               inputChannels,
-                                               inputHeight,
-                                               inputWidth,
-                                               filterMultiplier,
-                                               filterHeight,
-                                               filterWidth,
-                                               strideH,
-                                               strideW,
-                                               paddingH,
-                                               paddingW,
-                                               inputGrad);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {
-    int colDataSize = outputChannels * filterHeight * filterWidth *
-                      outputHeight * outputWidth;
-
-    size_t blocks = (colDataSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1,
-                                filterGrad,
-                                false,
-                                true);
-
-    for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<
-          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
-                                                   colDataSize,
-                                                   outputGrad,
-                                                   inputData,
-                                                   batchSize,
-                                                   outputChannels,
-                                                   outputHeight,
-                                                   outputWidth,
-                                                   inputChannels,
-                                                   inputHeight,
-                                                   inputWidth,
-                                                   filterMultiplier,
-                                                   filterHeight,
-                                                   filterWidth,
-                                                   strideH,
-                                                   strideW,
-                                                   paddingH,
-                                                   paddingW,
-                                                   colData);
-      int K = outputHeight * outputWidth;
-      int M = colDataSize / K;
-
-      BaseMatrix colMatrix(M, K, colData, false, true);
-      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
-    }
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
-#else
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOpTest.cpp b/paddle/legacy/function/DepthwiseConvOpTest.cpp
deleted file mode 100644
index caf8f3597..000000000
--- a/paddle/legacy/function/DepthwiseConvOpTest.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ConvOpTest.h"
-
-namespace paddle {
-
-#ifdef PADDLE_WITH_CUDA
-TEST(DepthwiseConv, Forward) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "DepthwiseConv-GPU", forward);
-}
-
-TEST(DepthwiseConv, BackwardInput) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input);
-}
-
-TEST(DepthwiseConv, BackwardFilter) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter);
-}
-#endif
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-TEST(DepthwiseConv, Forward) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
-}
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/EigenGemm.cpp b/paddle/legacy/function/EigenGemm.cpp
deleted file mode 100644
index 5929c5c68..000000000
--- a/paddle/legacy/function/EigenGemm.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "paddle/legacy/function/EigenThreadDevice.h"
-
-namespace paddle {
-
-template <class T>
-struct EigenBlasGemm {
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
-                           Eigen::Aligned>
-      EigenMatrix;
-
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    Eigen::array<int, 2> sizeA;
-    if (transA) {
-      sizeA[0] = K;
-      sizeA[1] = M;
-      CHECK_EQ(M, lda);
-    } else {
-      sizeA[0] = M;
-      sizeA[1] = K;
-      CHECK_EQ(K, lda);
-    }
-    Eigen::array<int, 2> sizeB;
-    if (transB) {
-      sizeB[0] = N;
-      sizeB[1] = K;
-      CHECK_EQ(K, ldb);
-    } else {
-      sizeB[0] = K;
-      sizeB[1] = N;
-      CHECK_EQ(N, ldb);
-    }
-    Eigen::array<int, 2> sizeC = {{M, ldc}};
-    Eigen::array<int, 2> offsetC = {{0, 0}};
-    Eigen::array<int, 2> extentC = {{M, N}};
-
-    const EigenMatrix a(const_cast<T*>(A), sizeA);
-    const EigenMatrix b(const_cast<T*>(B), sizeB);
-    EigenMatrix c(C, sizeC);
-
-    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
-    Eigen::array<DimPair, 1> dims;
-    dims[0] = DimPair(1, 0);
-    dims[0].first = transA ? 0 : 1;
-    dims[0].second = transB ? 1 : 0;
-
-    auto* device = EigenDeviceWarpper::device();
-    if (N == ldc) {
-      if (alpha == T(1) && beta == T(0)) {
-        c.device(*device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.device(*device) += a.contract(b, dims);
-      } else {
-        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
-      }
-    } else {
-      if (alpha == T(1) && beta == T(0)) {
-        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
-      } else {
-        c.slice(offsetC, extentC).device(*device) =
-            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
-      }
-    }
-    EigenDeviceWarpper::free_device(device);
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template struct EigenBlasGemm<double>;
-#else
-template struct EigenBlasGemm<float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/EigenThreadDevice.h b/paddle/legacy/function/EigenThreadDevice.h
deleted file mode 100644
index eb92251c8..000000000
--- a/paddle/legacy/function/EigenThreadDevice.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- *     Unless required by applicable law or agreed to in writing, software
- *     distributed under the License is distributed on an "AS IS" BASIS,
- *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *     See the License for the specific language governing permissions and
- *     limitations under the License. */
-
-#pragma once
-
-#if defined(__OSX__) || defined(__APPLE__)
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-
-#if defined(__ANDROID__)
-int GetCpuCount() {
-  FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
-  if (!fp) {
-    return 1;
-  }
-  int rank0, rank1;
-  int num = fscanf(fp, "%d-%d", &rank0, &rank1);
-  fclose(fp);
-  if (num < 2) return 1;
-  return rank1 + 1;
-}
-#elif defined(__OSX__) || defined(__APPLE__)
-int GetCpuCount() {
-  int count = 0;
-  size_t len = sizeof(int);
-  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
-  return count > 0 ? count : 1;
-}
-#else
-int GetCpuCount() { return 1; }
-#endif
-
-class EigenDeviceWarpper {
- public:  // NOLINT
-#if EIGEN_USE_THREADS
-  static Eigen::ThreadPoolDevice* device() {
-    const int num_cpus = GetCpuCount();
-    const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
-    static Eigen::ThreadPool tp(num_threads);
-    static Eigen::ThreadPoolDevice* device =
-        new Eigen::ThreadPoolDevice(&tp, num_threads);
-    return device;
-  }
-
-  static void free_device(Eigen::ThreadPoolDevice* device) {
-    // do nothing
-  }
-#else
-  static Eigen::DefaultDevice* device() {
-    Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
-    return device;
-  }
-
-  static void free_device(Eigen::DefaultDevice* device) { delete device; }
-#endif
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Function.cpp b/paddle/legacy/function/Function.cpp
deleted file mode 100644
index 344358fd3..000000000
--- a/paddle/legacy/function/Function.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-
-namespace paddle {
-
-void BufferArgs::addArg(const Matrix& arg,
-                        const TensorShape& shape,
-                        ArgType argType) {
-  _args_.push_back(new BufferArg(arg, shape, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  _args_.push_back(new SparseMatrixArg(arg, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  _args_.push_back(new SparseMatrixArg(arg, argType));
-  addArg(*_args_.back());
-}
-
-void BufferArgs::addArg(const Matrix& matrix,
-                        const IVector& vector,
-                        ArgType argType) {
-  _args_.push_back(new SequenceArg(matrix, vector, argType));
-  addArg(*_args_.back());
-}
-
-ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Function.h b/paddle/legacy/function/Function.h
deleted file mode 100644
index bc5ef7e6f..000000000
--- a/paddle/legacy/function/Function.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "BufferArg.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Any.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-/**
- * Function Configuration.
- * The argument type of Function::init.
- */
-class FuncConfig {
- public:
-  template <typename T>
-  T get(const std::string& key, Error* err = nullptr) const {
-    try {
-      return any_cast<T>(valueMap_.at(key));
-    } catch (std::exception& e) {  // could be cast or out of range exception.
-      if (err) {
-        *err = Error(e.what());
-      } else {
-        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
-      }
-      return T();
-    }
-  }
-
-  template <typename T>
-  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
-    auto it = valueMap_.find(key);
-    if (it != valueMap_.end()) {  // already contains key.
-      if (err) {
-        *err = Error("Key %s is already set in FuncConfig", key.c_str());
-      } else {
-        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
-      }
-      return *this;
-    }
-    valueMap_[key] = any(v);
-    return *this;
-  }
-
- protected:
-  mutable std::unordered_map<std::string, any> valueMap_;
-};
-
-/**
- * Argument type for Function::calc().
- * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs and outputs.
- *
- * addArg() with Matix object used to adapt Layer Argument.
- * Will create a BufferArg object in addArg(),
- * and free in destructor of BufferArgs.
- *
- * addArg() with BufferArg object, just save BufferArg object address,
- * and the caller needs to guarantee the validity of the BufferArg object
- * in the BufferArgs life time.
- */
-class BufferArgs {
- public:
-  BufferArgs() {}
-
-  ~BufferArgs() {
-    for (auto arg : _args_) {
-      delete arg;
-    }
-  }
-
-  size_t size() const { return args_.size(); }
-
-  // add argument into BufferArgs
-  // Tensor can be Matrix, Vector, IVector.
-  // For inputs, do not need argType.
-  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  // Add arg into BufferArgs and reshape the arg.
-  //
-  // For example, arg represents an image buffer,
-  // but Matrix can only represent a two-dimensional Tensor.
-  // So need an extra argument to describe the shape of the image buffer.
-  void addArg(const Matrix& arg,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED);
-
-  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-
-  void addArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED);
-
-  // get argument
-  const BufferArg& operator[](size_t num) const {
-    CHECK_LT(num, args_.size());
-    return *args_[num];
-  }
-
-  void addArg(BufferArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
-
- private:
-  std::vector<BufferArg*> args_;
-  // The BufferArg object is constructed and freed by BufferArgs.
-  std::vector<BufferArg*> _args_;
-};
-
-/**
- * \brief Base class for Function.
- * The basic Function implementation requires override init and calc interfaces.
- *
- * The caller needs to ensure the validity of the arguments
- * during Function execution.
- *
- * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
- * and ADD_TO.
- * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
- * result of Function assigned to the output BufferArg.
- * If output.getArgType() == ADD_TO, this is add mode, and the calculation
- * result of Function need added to the output BufferArg.
- *
- * For example:
- * ASSIGN_TO: output = Function(inputs)
- * ADD_TO: output += Function(inputs)
- * If Function has more than one output, each output can have different modes.
- */
-class FunctionBase {
- public:
-  virtual ~FunctionBase() {}
-
-  virtual void init(const FuncConfig& config) {}
-
-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // This member function is used to check whether the BufferType and shape of
-  // the inputs and outputs arguments of the Function are correct.
-  // General calc function which will call this check to do arguments check.
-  // And before the calc called, the caller can also check their own arguments.
-  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // Calculate the number of floating-point operations of this Function.
-  // The inputs and outputs arguments do not need to contain the actual data,
-  // only the shape.
-  // And some Functions have the same input and output shapes,
-  // so you may not need to enter the complete number of arguments.
-  // But entering the full arguments is always correct for this interface.
-  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
-    return 0;
-  }
-
-  int getNumInputs() const { return numInputs_; }
-
-  int getNumOutputs() const { return numOutputs_; }
-
-  static ClassRegistrar<FunctionBase> funcRegistrar_;
-
- protected:
-  // numInputs_ and numOutputs_ represents the maximum
-  // input and output supported by Function.
-  // Some functions are optimized for input and output,
-  // so when comparing the number of arguments, for these functions
-  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
-  size_t numInputs_;
-  size_t numOutputs_;
-};
-
-#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
-
-#define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
-  static InitFunction __reg_type_##typeName##deviceName([]() { \
-    FunctionBase::funcRegistrar_                               \
-        .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
-            FUNC_NAME(typeName, deviceName));                  \
-  })
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.cpp b/paddle/legacy/function/FunctionTest.cpp
deleted file mode 100644
index 1a0993e31..000000000
--- a/paddle/legacy/function/FunctionTest.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-template <DeviceType DType>
-void FunctionApi(typename Tensor<real, DType>::Matrix& output,
-                 const typename Tensor<real, DType>::Matrix& input);
-
-template <>
-void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100U);
-  EXPECT_EQ(output.getWidth(), 200U);
-}
-
-template <>
-void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10U);
-  EXPECT_EQ(output.getWidth(), 20U);
-}
-
-template <DeviceType DType>
-void Function(const BufferArgs& arguments) {
-  const auto input = arguments[0].matrix<DType>();
-  auto output = arguments[1].matrix<DType>();
-  FunctionApi<DType>(output, input);
-}
-
-TEST(Function, BufferArgs) {
-  CpuMatrix cpuInput = CpuMatrix(100, 200);
-  CpuMatrix cpuOutput = CpuMatrix(100, 200);
-  BufferArgs cpuArgments;
-  cpuArgments.addArg(cpuInput);
-  cpuArgments.addArg(cpuOutput);
-  Function<DEVICE_TYPE_CPU>(cpuArgments);
-
-  GpuMatrix gpuInput = GpuMatrix(10, 20);
-  GpuMatrix gpuOutput = GpuMatrix(10, 20);
-  BufferArgs gpuArgments;
-  gpuArgments.addArg(gpuInput);
-  gpuArgments.addArg(gpuOutput);
-  Function<DEVICE_TYPE_GPU>(gpuArgments);
-}
-
-/**
- * Some tests case are used to check the consistency between the BufferArg type
- * argument received by Function and the original type argument.
- *
- * Use Case:
- *  TEST() {
- *    Matrix matrix(...);
- *    CheckBufferArg lambda = [=](const BufferArg& arg) {
- *      // check matrix and arg are equivalent
- *      EXPECT_EQ(matrix, arg);
- *    }
- *
- *   BufferArgs argments{matrix...};
- *   std::vector<CheckBufferArg> checkFunc{lambda...};
- *   testBufferArgs(argments, checkFunc);
- *  }
- */
-typedef std::function<void(const BufferArg&)> CheckBufferArg;
-
-void testBufferArgs(const BufferArgs& inputs,
-                    const std::vector<CheckBufferArg>& check) {
-  EXPECT_EQ(inputs.size(), check.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    check[i](inputs[i]);
-  }
-}
-
-void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1U);
-  check(inputs[0]);
-}
-
-TEST(Arguments, Matrix) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.shape()[1], 200U);
-    EXPECT_EQ(arg.data(), matrix->getData());
-
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, Vector) {
-  VectorPtr vector = Vector::create(100, false);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.data(), vector->getData());
-
-    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*vector);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, CpuSparseMatrix) {
-  CpuSparseMatrix sparse(200, 300, 50);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 200U);
-    EXPECT_EQ(arg.shape()[1], 300U);
-    EXPECT_EQ(arg.data(), sparse.getData());
-    // CHECK_EQ(arg.sparse().nnz(), 50);
-    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
-  };
-
-  BufferArgs argments;
-  argments.addArg(sparse);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, BufferArg) {
-  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3U);
-    EXPECT_EQ(arg.shape()[0], 1U);
-    EXPECT_EQ(arg.shape()[1], 2U);
-    EXPECT_EQ(arg.shape()[2], 3U);
-  };
-
-  BufferArgs argments;
-  argments.addArg(arg);
-  testBufferArgs(argments, check);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.h b/paddle/legacy/function/FunctionTest.h
deleted file mode 100644
index 6f01981a3..000000000
--- a/paddle/legacy/function/FunctionTest.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
-
-namespace test {
-template <DeviceType DType>
-struct Allocator;
-
-template <>
-struct Allocator<DEVICE_TYPE_CPU> {
-  using type = CpuMemoryHandle;
-};
-
-template <>
-struct Allocator<DEVICE_TYPE_GPU> {
-  using type = GpuMemoryHandle;
-};
-
-// Copy argument1 to argument2
-template <DeviceType DType1, DeviceType DType2>
-class CopyArgument {
- public:
-  void operator()(const BufferArg& arg1, BufferArg& arg2) {
-    CHECK_EQ(arg1.valueType(), arg2.valueType());
-    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
-
-    if (arg1.valueType() == VALUE_TYPE_INT32) {
-      IVectorPtr vector1 =
-          IVector::create((int*)arg1.data(),
-                          arg1.shape().getElements(),
-                          DType1 == DEVICE_TYPE_CPU ? false : true);
-      IVectorPtr vector2 =
-          IVector::create((int*)arg2.data(),
-                          arg2.shape().getElements(),
-                          DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    } else {
-      VectorPtr vector1 =
-          Vector::create((real*)arg1.data(),
-                         arg1.shape().getElements(),
-                         DType1 == DEVICE_TYPE_CPU ? false : true);
-      VectorPtr vector2 =
-          Vector::create((real*)arg2.data(),
-                         arg2.shape().getElements(),
-                         DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    }
-  }
-};
-}  // namespace test
-
-/**
- * \brief A class for comparing two Functions of different implementations.
- *        For example, can be used to compare the CPU and GPU implementation
- *        of the function is consistent.
- *
- * Use case:
- *  // Initializes a test object, the corresponding cpu and gpu Function
- *  // are constructed according to FunctionName and FuncConfig.
- *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
- *  // Prepare inputs and outputs arguments.
- *  // Here the input and output can not contain real data,
- *  // only contains the argument type and shape.
- *  test.addInputs(input1);
- *  test.addInputs(input2);
- *  test.addOutputs(output1);
- *  test.addOutputs(output2);
- *  // Run.
- *  // Will according to the type and shape of arguments(inputs_/outputs_),
- *  // automatic initialization cpu and gpu function required arguments
- *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
- *  // Call the CPU and GPU Function calculation results.
- *  // Compares CPU and GPU calculation results for consistency.
- *  test.run();
- */
-template <DeviceType DType1, DeviceType DType2>
-class Compare2Function {
- public:
-  typedef typename test::Allocator<DType1>::type Allocator1;
-  typedef typename test::Allocator<DType2>::type Allocator2;
-  typedef typename Tensor<real, DType1>::Vector Vector1;
-  typedef typename Tensor<real, DType2>::Vector Vector2;
-  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
-  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
-
-  Compare2Function(const std::string& name1,
-                   const std::string& name2,
-                   const FuncConfig& config)
-      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
-        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
-    function1_->init(config);
-    function2_->init(config);
-    initArgsCallback_ = nullptr;
-  }
-
-  ~Compare2Function() {}
-
-  // input need only contains shape, do not contains data.
-  void addInputs(const BufferArg& input) {
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
-    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
-  }
-
-  // assume one copy of sequence is shared by different SequenceArgs
-  void addSequence(const SequenceIdArg& input) {
-    CHECK_EQ(input.shape().ndims(), 1UL);
-    size_t batchSize = input.shape()[0];
-    size_t numSeqs = batchSize / 10 + 1;
-    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
-    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    /// init sequence Id
-    initArg(*seq1_, batchSize);
-
-    copyArg_(*seq1_, *seq2_);
-  }
-
-  void addInputs(const SequenceArg& input) {
-    CHECK_EQ(input.shape().ndims(), 2UL);
-    size_t batchSize = input.shape()[0];
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq1_));
-    func2Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq2_));
-  }
-
-  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
-    initArgsCallback_ = callback;
-  }
-
-  // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-  }
-
-  /// add and init output sparse matrix
-  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
-  }
-
-  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
-    CHECK_EQ(output.shape().ndims(), 2UL);
-    size_t batchSize = output.shape()[0];
-
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq1_,
-                                      argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq2_,
-                                      argType));
-  }
-
-  void addInputs(const SparseMatrixArg& input) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
-    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
-  }
-
-  void run() {
-    // prepare cpu/gpu arguments
-    initInputs();
-
-    initOutputs();
-    // function calculate
-    auto callFunction = [](FunctionBase* function,
-                           std::vector<BufferArgPtr>& inputs,
-                           std::vector<BufferArgPtr>& outputs) {
-      BufferArgs inArgs;
-      BufferArgs outArgs;
-      for (auto arg : inputs) {
-        inArgs.addArg(*arg);
-      }
-      for (auto arg : outputs) {
-        outArgs.addArg(*arg);
-      }
-      function->calc(inArgs, outArgs);
-    };
-
-    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
-    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
-
-    // check outputs
-    compareOutputs();
-  }
-
-  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
-
-  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
-
- protected:
-  // only init cpu argument, gpu argument copy from cpu argument.
-  void initArg(BufferArg& arg) {
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceArg& arg) {
-    /// init only matrix
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceIdArg& arg, size_t batchSize) {
-    size_t numSeqs = arg.numSeqs();
-    int* buf = reinterpret_cast<int*>(arg.data());
-    int pos = 0;
-    size_t maxLen = 2 * batchSize / numSeqs;
-    for (int i = 0; i < (int)numSeqs; ++i) {
-      int len = 1 + uniformRandom(std::min<int64_t>(
-                        maxLen, batchSize - pos - numSeqs + i));
-      buf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = batchSize;
-  }
-
-  void initInputs() {
-    for (size_t i = 0; i < func1Inputs_.size(); i++) {
-      if (func1Inputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Inputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
-      } else {
-        initArg(*func1Inputs_[i]);
-      }
-
-      if (initArgsCallback_ != nullptr) {
-        initArgsCallback_(*func1Inputs_[i], i);
-      }
-
-      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
-    }
-  }
-
-  void initOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      if (func1Outputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Outputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
-      } else {
-        initArg(*func1Outputs_[i]);
-      }
-
-      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
-    }
-  }
-
-  void compareOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = func1Outputs_[i];
-      const auto gpu = func2Outputs_[i];
-      CHECK_EQ(cpu->numElements(), gpu->numElements());
-      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
-      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
-      autotest::TensorCheckErr(cpuVector, gpuVector);
-    }
-  }
-
- protected:
-  std::shared_ptr<FunctionBase> function1_;
-  std::shared_ptr<FunctionBase> function2_;
-  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
-  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
-  std::vector<BufferArgPtr> func1Inputs_;
-  std::vector<BufferArgPtr> func1Outputs_;
-  std::vector<BufferArgPtr> func2Inputs_;
-  std::vector<BufferArgPtr> func2Outputs_;
-  std::shared_ptr<SparseMatrix1> sparse1_;
-  std::shared_ptr<SparseMatrix2> sparse2_;
-  std::shared_ptr<SequenceIdArg> seq1_;
-  std::shared_ptr<SequenceIdArg> seq2_;
-  test::CopyArgument<DType1, DType2> copyArg_;
-  std::function<void(BufferArg&, size_t)> initArgsCallback_;
-};
-
-class CpuGpuFuncCompare
-    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
- public:
-  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
-      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
-
-  ~CpuGpuFuncCompare() {}
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmConvOp.cpp b/paddle/legacy/function/GemmConvOp.cpp
deleted file mode 100644
index 5a8131566..000000000
--- a/paddle/legacy/function/GemmConvOp.cpp
+++ /dev/null
@@ -1,522 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-#include "GemmFunctor.h"
-#include "Im2Col.h"
-#include "paddle/legacy/math/MemoryHandle.h"
-
-namespace paddle {
-
-/*
- * \brief Forward calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int K = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        K,
-                                        colData,
-                                        N,
-                                        beta,
-                                        outputData + g * outputOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-
-/*
- * \brief Forward calculation of convolution, optimized for mobile.
- */
-template <DeviceType Device>
-class GemmConvMobileFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    real* colData = NULL;
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape;
-
-    // Max col matrix width 4096, Max col matrix size 4M.
-    size_t outputHeightSteps =
-        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
-    size_t maxColWidth = outputHeightSteps * outputWidth;
-    size_t channelSteps =
-        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
-                          (size_t)1),
-                 inputChannels / groups_);
-    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-
-      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColMobileFunctor<real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    int nStride = outputHeight * outputWidth;
-    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
-    for (size_t i = 0; i < batchSize; i++) {
-      filterData = inputs[1].data<real>();
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          real beta_ = beta;
-          for (size_t ic = 0; ic < inputChannels / groups_;
-               ic += channelSteps) {
-            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
-            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
-              int height = std::min(outputHeight - oh, outputHeightSteps);
-
-              int M = outputChannels / groups_;
-              int N = height * outputWidth;
-              int K = channels * filterHeight * filterWidth;
-              // im2col
-              im2col(inputData,
-                     imShape,
-                     colData,
-                     colShape,
-                     strideH(),
-                     strideW(),
-                     paddingH(),
-                     paddingW(),
-                     dilationH(),
-                     dilationW(),
-                     channels,
-                     oh,
-                     height,
-                     N);
-
-              // gemm
-              BlasGemm<Device, real>::compute(
-                  false,
-                  false,
-                  M,
-                  N,
-                  K,
-                  1.0f,
-                  filterData + ic * filterHeight * filterWidth,
-                  kStride,
-                  colData,
-                  N,
-                  beta_,
-                  outputData + oh * outputWidth,
-                  nStride);
-            }
-            beta_ = 1.0;
-          }
-        } else {
-          int M = outputChannels / groups_;
-          int N = outputHeight * outputWidth;
-          int K = inputChannels / groups_ * filterHeight * filterWidth;
-          BlasGemm<Device, real>::compute(false,
-                                          false,
-                                          M,
-                                          N,
-                                          K,
-                                          1.0f,
-                                          filterData,
-                                          K,
-                                          inputData,
-                                          N,
-                                          beta,
-                                          outputData,
-                                          N);
-        }
-        inputData += inputOffset;
-        outputData += outputOffset;
-        filterData += filterOffset;
-      }
-    }
-
-    memory_.reset();
-  }
-};
-
-#endif
-
-/*
- * \brief Backward input calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradInputFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Col2ImFunctor<kCFO, Device, real> col2im;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        int K = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int M = inputChannels / groups_ * filterHeight * filterWidth;
-        real scale = 0.0f;
-        if (!needIm2col) {
-          colData = inputGrad + g * inputOffset;
-          scale = 1.0f;
-        }
-        BlasGemm<Device, real>::compute(true,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        M,
-                                        outputGrad + g * outputOffset,
-                                        N,
-                                        scale,
-                                        colData,
-                                        N);
-        if (needIm2col) {
-          col2im(inputGrad + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        }
-      }
-      inputGrad += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-/*
- * \brief Backward filter calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradFilterFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int K = outputHeight * outputWidth;
-        int N = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        true,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        outputGrad + g * outputOffset,
-                                        K,
-                                        colData,
-                                        K,
-                                        i == 0 ? beta : 1.0f,
-                                        filterGrad + g * filterOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
-#else
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
-#endif
-REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
-REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmConvOpTest.cpp b/paddle/legacy/function/GemmConvOpTest.cpp
deleted file mode 100644
index a30b7c90b..000000000
--- a/paddle/legacy/function/GemmConvOpTest.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ConvOpTest.h"
-
-namespace paddle {
-
-TEST(GemmConv, NaiveConv) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "NaiveConv-CPU", "GemmConv-CPU", forward);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "NaiveConv-CPU", "GemmConv-CPU", forward);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(GemmConv, Forward) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "GemmConv-GPU", forward);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConv-CPU", "GemmConv-GPU", forward);
-}
-
-TEST(GemmConv, BackwardInput) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
-}
-
-TEST(GemmConv, BackwardFilter) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
-  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmFunctor.cpp b/paddle/legacy/function/GemmFunctor.cpp
deleted file mode 100644
index 450293dfe..000000000
--- a/paddle/legacy/function/GemmFunctor.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GemmFunctor.h"
-#include "paddle/legacy/math/MathFunctions.h"
-
-namespace paddle {
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_CPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-#ifdef PADDLE_USE_EIGEN_FOR_BLAS
-    EigenBlasGemm<T>::compute(
-        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
-            transB == false ? CblasNoTrans : CblasTrans,
-            M,
-            N,
-            K,
-            alpha,
-            A,
-            lda,
-            B,
-            ldb,
-            beta,
-            C,
-            ldc);
-#endif
-  }
-};
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_GPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == false ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  transB == false ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
-};
-
-template struct BlasGemm<DEVICE_TYPE_CPU, real>;
-template struct BlasGemm<DEVICE_TYPE_GPU, real>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GemmFunctor.h b/paddle/legacy/function/GemmFunctor.h
deleted file mode 100644
index df63fc64f..000000000
--- a/paddle/legacy/function/GemmFunctor.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorType.h"
-
-namespace paddle {
-
-// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
-// cblas_dgemm interface's parameter format, it is necessary to introduce
-// GemmFunctor as a new interface. Later, when considering the implementation
-// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
-// interface.
-template <DeviceType Device, class T>
-struct BlasGemm {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc);
-};
-
-// TODO(hedaoyuan): Since the definition of the real type in the Paddle
-// conflicts with the Eigen library, so compile the Eigen code can not
-// include the Paddle header file. And need an EigenBlasGemm template class
-// that does not contain the DeviceType parameter.
-// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
-template <class T>
-struct EigenBlasGemm {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/GruFunctor.h b/paddle/legacy/function/GruFunctor.h
deleted file mode 100644
index d5a30c332..000000000
--- a/paddle/legacy/function/GruFunctor.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GemmFunctor.h"
-#include "hl_cpu_gru.cuh"
-
-namespace paddle {
-
-template <DeviceType Device, class T>
-struct GruFunctor {
-  template <class OpResetOutput, class OpFinalOutput>
-  static void compute(OpResetOutput opResetOutput,
-                      OpFinalOutput opFinalOutput,
-                      hl_gru_value value,
-                      int frameSize,
-                      int batchSize,
-                      hl_activation_mode_t active_node,
-                      hl_activation_mode_t active_gate) {
-#ifndef __NVCC__
-    if (value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   false,
-                                   batchSize,
-                                   2 * frameSize,
-                                   frameSize,
-                                   1,
-                                   value.prevOutValue,
-                                   frameSize,
-                                   value.gateWeight,
-                                   frameSize * 2,
-                                   1,
-                                   value.gateValue,
-                                   frameSize * 3);
-    }
-
-    forward_reset_output(
-        opResetOutput, value, frameSize, batchSize, active_gate);
-
-    if (value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   false,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize,
-                                   1,
-                                   value.resetOutputValue,
-                                   frameSize,
-                                   value.stateWeight,
-                                   frameSize,
-                                   1,
-                                   value.gateValue + frameSize * 2,
-                                   frameSize * 3);
-    }
-
-    forward_final_output(
-        opFinalOutput, value, frameSize, batchSize, active_node);
-#endif
-  }
-};
-
-template <DeviceType Device, class T>
-struct GruGradFunctor {
-  template <class OpStateGrad, class OpResetGrad>
-  static void compute(OpStateGrad opStateGrad,
-                      OpResetGrad opResetGrad,
-                      hl_gru_value value,
-                      hl_gru_grad grad,
-                      int frameSize,
-                      int batchSize,
-                      hl_activation_mode_t active_node,
-                      hl_activation_mode_t active_gate) {
-#ifndef __NVCC__
-    backward_state_grad(
-        opStateGrad, value, grad, frameSize, batchSize, active_node);
-
-    if (value.prevOutValue && grad.prevOutGrad) {
-      BlasGemm<Device, T>::compute(false,
-                                   true,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize,
-                                   1,
-                                   grad.gateGrad + frameSize * 2,
-                                   frameSize * 3,
-                                   value.stateWeight,
-                                   frameSize,
-                                   0,
-                                   grad.resetOutputGrad,
-                                   frameSize);
-
-      if (grad.stateWeightGrad) {
-        BlasGemm<Device, T>::compute(true,
-                                     false,
-                                     frameSize,
-                                     frameSize,
-                                     batchSize,
-                                     1,
-                                     value.resetOutputValue,
-                                     frameSize,
-                                     grad.gateGrad + frameSize * 2,
-                                     frameSize * 3,
-                                     1,
-                                     grad.stateWeightGrad,
-                                     frameSize);
-      }
-    }
-
-    backward_reset_grad(
-        opResetGrad, value, grad, frameSize, batchSize, active_gate);
-
-    if (grad.prevOutGrad && value.prevOutValue) {
-      BlasGemm<Device, T>::compute(false,
-                                   true,
-                                   batchSize,
-                                   frameSize,
-                                   frameSize * 2,
-                                   1,
-                                   grad.gateGrad,
-                                   frameSize * 3,
-                                   value.gateWeight,
-                                   frameSize * 2,
-                                   1,
-                                   grad.prevOutGrad,
-                                   frameSize);
-
-      if (grad.gateWeightGrad) {
-        BlasGemm<Device, T>::compute(true,
-                                     false,
-                                     frameSize,
-                                     frameSize * 2,
-                                     batchSize,
-                                     1,
-                                     value.prevOutValue,
-                                     frameSize,
-                                     grad.gateGrad,
-                                     frameSize * 3,
-                                     1,
-                                     grad.gateWeightGrad,
-                                     frameSize * 2);
-      }
-    }
-#endif
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2Col.h b/paddle/legacy/function/Im2Col.h
deleted file mode 100644
index e0ce6918a..000000000
--- a/paddle/legacy/function/Im2Col.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "neon/neon_util.h"
-
-namespace paddle {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions(CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation,
- *        And in the Col2ImFunctor calculation, it is reversed.
- *
- * \param imData   Image data.
- * \param imShape  The shape of imData,
- *                 [inputChannels, inputHeight, inputWidth].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * inputChannels * filterHeight * filterWidth, and the width is equal
- * outputHeight * outputWidth.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [inputChannels,
- *      filterHeight,
- *      filterWidth,      ======>      [height, width]
- *      outputHeight,
- *      outputWidth]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- * So, it is easy to reshape into a sequence matrix for rnn calculation.
- * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
- * is equal outputHeight * outputWidth, and the stepSize is equal
- * inputChannels * filterHeight * filterWidth.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [outputHeight,
- *      outputWidth,
- *      inputChannels,    ======>    [seqLength, stepSize]
- *      filterHeight,
- *      filterWidth]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <ColFormat Format, DeviceType Device, class T>
-class Im2ColFunctor {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <ColFormat Format, DeviceType Device, class T>
-class Col2ImFunctor {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <class T>
-class Im2ColMobileFunctor {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth,
-                  int inputChannels,
-                  int colOffset,
-                  int colOutputHeight,
-                  int colWidth) {
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputWidth = colShape[4];
-
-    for (int ic = 0; ic < inputChannels; ic++) {
-      for (int oh = 0; oh < colOutputHeight; oh++) {
-        T* dstData = colData + oh * outputWidth;
-        for (int fh = 0; fh < filterHeight; fh++) {
-          for (int fw = 0; fw < filterWidth; fw++) {
-            int imRowIdx = (oh + colOffset) * strideHeight +
-                           fh * dilationHeight - paddingHeight;
-            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
-              memset(dstData, 0, outputWidth * sizeof(T));
-            } else {
-              for (int ow = 0; ow < outputWidth; ow++) {
-                int imColIdx =
-                    ow * strideWidth + fw * dilationWidth - paddingWidth;
-                if (imColIdx < 0 || imColIdx >= inputWidth) {
-                  dstData[ow] = T(0);
-                } else {
-                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
-                }
-              }
-            }
-            dstData += colWidth;
-          }
-        }
-      }
-      colData += filterHeight * filterWidth * colWidth;
-      imData += inputHeight * inputWidth;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOp.cpp b/paddle/legacy/function/Im2ColOp.cpp
deleted file mode 100644
index 55a3ff98d..000000000
--- a/paddle/legacy/function/Im2ColOp.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) < 0 ||
-              (imRowIdx - paddingHeight) >= inputHeight ||
-              (imColIdx - paddingWidth) < 0 ||
-              (imColIdx - paddingWidth) >= inputWidth) {
-            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
-          } else {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            colData[(c * outputHeight + h) * outputWidth + w] =
-                imData[imRowIdx * inputWidth + imColIdx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) >= 0 &&
-              (imRowIdx - paddingHeight) < inputHeight &&
-              (imColIdx - paddingWidth) >= 0 &&
-              (imColIdx - paddingWidth) < inputWidth) {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            imData[imRowIdx * inputWidth + imColIdx] +=
-                colData[(c * outputHeight + h) * outputWidth + w];
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
-                  imColOffset < 0 || imColOffset >= inputWidth) {
-                colData[colDataOffset] = float(0);
-              } else {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                colData[colDataOffset] = imData[imDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
-                  imColOffset >= 0 && imColOffset < inputWidth) {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                imData[imDataOffset] += colData[colDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOpGpu.cu b/paddle/legacy/function/Im2ColOpGpu.cu
deleted file mode 100644
index 96dd8f528..000000000
--- a/paddle/legacy/function/Im2ColOpGpu.cu
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include "hl_device_functions.cuh"
-
-namespace paddle {
-
-template <class T>
-__global__ void im2col(const T* data_im,
-                       int numOuts,
-                       int height,
-                       int width,
-                       int blockH,
-                       int blockW,
-                       int strideH,
-                       int strideW,
-                       int paddingH,
-                       int paddingW,
-                       int dilationH,
-                       int dilationW,
-                       int height_col,
-                       int width_col,
-                       T* data_col) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < numOuts) {
-    int w_out = index % width_col;
-    index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * blockH * blockW;
-    int h_in = h_out * strideH;
-    int w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (int i = 0; i < blockH; ++i) {
-      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i * dilationH);
-        int cIdx = int(w_in + j * dilationW);
-        if ((rIdx - (int)paddingH) >= (int)height ||
-            (rIdx - (int)paddingH) < 0 ||
-            (cIdx - (int)paddingW) >= (int)width ||
-            (cIdx - (int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in * height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx * width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    int numKernels = inputChannels * outputHeight * outputWidth;
-    int blocks = (numKernels + 1024 - 1) / 1024;
-    int blockX = 512;
-    int blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                    numKernels,
-                                                    inputHeight,
-                                                    inputWidth,
-                                                    filterHeight,
-                                                    filterWidth,
-                                                    strideHeight,
-                                                    strideWidth,
-                                                    paddingHeight,
-                                                    paddingWidth,
-                                                    dilationHeight,
-                                                    dilationWidth,
-                                                    outputHeight,
-                                                    outputWidth,
-                                                    colData);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2im(size_t n,
-                       const T* data_col,
-                       size_t height,
-                       size_t width,
-                       size_t channels,
-                       size_t blockH,
-                       size_t blockW,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t paddingH,
-                       size_t paddingW,
-                       size_t dilationH,
-                       size_t dilationW,
-                       size_t height_col,
-                       size_t width_col,
-                       T* data_im) {
-  size_t index =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    int filterH = (blockH - 1) * dilationH + 1;
-    int filterW = (blockW - 1) * dilationW + 1;
-
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width - 2 * paddingW) &&
-        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
-      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int h_k = (h - h_col * strideH);
-          int w_k = (w - w_col * strideW);
-          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
-            h_k /= dilationH;
-            w_k /= dilationW;
-            int c_col =
-                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
-                    width_col +
-                w_col;
-            val += data_col[c_col];
-          }
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
-              h * (width - 2 * paddingW) + w] += val;
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
-                        (inputWidth + 2 * paddingWidth);
-
-    size_t blocks = (numKernels + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        numKernels,
-        colData,
-        inputHeight + 2 * paddingHeight,
-        inputWidth + 2 * paddingWidth,
-        inputChannels,
-        filterHeight,
-        filterWidth,
-        strideHeight,
-        strideWidth,
-        paddingHeight,
-        paddingWidth,
-        dilationHeight,
-        dilationWidth,
-        outputHeight,
-        outputWidth,
-        imData);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-
-template <class T>
-__global__ void im2colOCF(const T* imData,
-                          T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationHeight + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationWidth + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= inputHeight || heightOffset < 0 ||
-            widthOffset >= inputWidth || widthOffset < 0) {
-          colData[colOffset] = T(0);
-        } else {
-          colData[colOffset] = imData[imOffset];
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2imOCF(T* imData,
-                          const T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationWidth + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationHeight + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= 0 && heightOffset < inputHeight &&
-            widthOffset >= 0 && widthOffset < inputWidth) {
-          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
- public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColTest.cpp b/paddle/legacy/function/Im2ColTest.cpp
deleted file mode 100644
index 2c5f06f38..000000000
--- a/paddle/legacy/function/Im2ColTest.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include <gtest/gtest.h>
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-
-namespace paddle {
-
-template <DeviceType Device, class T>
-void TestIm2ColFunctor() {
-  for (size_t channels : {1, 5, 32}) {
-    for (size_t inputHeight : {5, 33, 100}) {
-      for (size_t inputWidth : {5, 32, 96}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-                  TensorShape colShape2 = TensorShape({outputHeight,
-                                                       outputWidth,
-                                                       channels,
-                                                       filterHeight,
-                                                       filterWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(width, height, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, Device, T> im2Col1;
-                  Im2ColFunctor<kOCF, Device, T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-
-                  // The transposition of the result of ColFormat == kCFO
-                  // is equal to the result of ColFormat == kOCF.
-                  MatrixPtr test;
-                  output2->transpose(test, true);
-                  autotest::TensorCheckErr(*output1, *test);
-
-                  Col2ImFunctor<kCFO, Device, T> col2Im1;
-                  Col2ImFunctor<kOCF, Device, T> col2Im2;
-
-                  col2Im1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  col2Im2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  autotest::TensorCheckErr(*input1, *input2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
-
-#ifdef PADDLE_WITH_CUDA
-
-TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
-
-#endif
-
-template <class T>
-void TestIm2ColMobileFunctor() {
-  for (size_t channels : {32}) {
-    for (size_t inputHeight : {33, 100}) {
-      for (size_t inputWidth : {32, 96}) {
-        for (size_t filterHeight : {5}) {
-          for (size_t filterWidth : {7}) {
-            for (size_t stride : {2}) {
-              for (size_t padding : {1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(height, width, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
-                  Im2ColMobileFunctor<T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation,
-                          channels,
-                          0,
-                          outputHeight,
-                          outputHeight * outputWidth);
-
-                  autotest::TensorCheckEqual(*output1, *output2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.cpp b/paddle/legacy/function/MulOp.cpp
deleted file mode 100644
index 750978fc9..000000000
--- a/paddle/legacy/function/MulOp.cpp
+++ /dev/null
@@ -1,347 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "GemmFunctor.h"
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace {
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
-  }
-}
-}  // namespace
-
-namespace paddle {
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* A = a.getData();
-  const real* B = b.getData();
-  real* C = out.getValue();
-  int* rows = out.getRows();
-  int* cols = out.getCols();
-  size_t width = out.getWidth();
-  size_t height = out.getHeight();
-
-  /// SPARSE_CSC, {a any, b not trans}
-  if (out.getFormat() == SPARSE_CSC) {
-    /// b not trans and a any
-    CHECK(!bTrans);
-    size_t m = !aTrans ? a.getWidth() : a.getHeight();
-    for (size_t i = 0; i < width; i++) {
-      size_t start = out.getColStartIdx(i);
-      size_t end = out.getColStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t rowIdx = rows[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
-                 B[k * width + i];
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
-  if (out.getFormat() == SPARSE_CSR) {
-    /// a and b can not both transpose
-    CHECK(!(aTrans && bTrans));
-    size_t m = a.getWidth();
-    for (size_t i = 0; i < height; i++) {
-      size_t start = out.getRowStartIdx(i);
-      size_t end = out.getRowStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t colIdx = cols[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
-                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-}
-
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      aTrans,
-      bTrans,
-      out.getHeight(),
-      out.getWidth(),
-      !aTrans ? a.getWidth() : a.getHeight(),
-      scaleAB,
-      a.getData(),
-      a.getStride(),
-      b.getData(),
-      b.getStride(),
-      scaleT,
-      out.getData(),
-      out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuSparseMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* B = b.getData();
-  real* C = out.getData();
-  if (out.getWidth() % 32 == 0) {
-    CHECK_EQ((size_t)B % 32, 0UL);
-    CHECK_EQ((size_t)C % 32, 0UL);
-  }
-
-  int* cols = a.getCols();
-  real* values = a.getValue();
-  for (size_t i = 0; i < a.getHeight(); ++i) {
-    const int start = a.getRowStartIdx(i);
-    const int end = a.getRowStartIdx(i + 1);
-    for (int j = start; j < end; ++j) {
-      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
-               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
-                       : const_cast<CpuMatrix&>(b).getRow(i),
-               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
-               out.getWidth());
-    }
-  }
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  real* A = const_cast<real*>(a.getData());
-  real* B = const_cast<real*>(b.getValue());
-  real* C = out.getData();
-  int* rows = b.getRows();
-  int* cols = b.getCols();
-
-  /// SPARSE_CSC format
-  if (b.getFormat() == SPARSE_CSC) {
-    for (size_t j = 0; j < b.getWidth(); ++j) {
-      int start = b.getColStartIdx(j);
-      int end = b.getColStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + j : C + rows[i],
-                    !bTrans ? A + rows[i] : A + j,
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR format
-  if (b.getFormat() == SPARSE_CSR) {
-    for (size_t j = 0; j < b.getHeight(); ++j) {
-      int start = b.getRowStartIdx(j);
-      int end = b.getRowStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + cols[i] : C + j,
-                    !bTrans ? A + j : A + cols[i],
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-}
-
-/**
- * mul operator
- * out = scaleT * out + scaleAB * (A * B)
- * here, scaleT in {0, 1}, scaleAB == 1,
- * out = A * B, ASSIGN_TO
- * out += A * B, ADD_TO
- *
- *
- * \param outputs[0]      output matrix (out), M * N,
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, N is num of columns
- * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, K is num of columns
- * \param inputs[1]       second input matrix (B), K * N (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        K is num of rows, N is num of columns
- *
- * Support eight Mul operators, with both GPU and CPU devices
- * For each device, four Mul operators are supported:
- * 1. dense (out) = dense (A) * dense (B)
- * 2. dense (out) = sparse (A) * dense (B)
- *    sparse matrix only support SPARSE_CSR format
- * 3. dense (out) = dense (A) * sparse (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- * 4. sparse (out) = dense (A) * dense (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- *
- */
-template <DeviceType Device>
-class MulFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    aTrans_ = config.get<bool>("aTrans");
-    bTrans_ = config.get<bool>("bTrans");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(!aTrans_ || !bTrans_)
-        << "Not support both a and b are transpose matrices";
-
-    CHECK_EQ((size_t)2, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
-    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-
-    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
-    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
-    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
-    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
-    /// C = A * B, or C += A * B, for matrix format
-    CHECK_EQ(aCol, bRow);
-    CHECK_EQ(aRow, outputs[0].shape()[0]);
-    CHECK_EQ(bCol, outputs[0].shape()[1]);
-
-    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
-    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
-
-    /// support dense = not both sparse * sparse
-    /// or sparse = dense * dense
-    CHECK((!outputs[0].isSparseArg() &&
-           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
-          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
-           !inputs[1].isSparseArg()));
-
-    auto outMat = outputs[0].matrix<Device>();
-    /// dense matrix = dense matrix * dense matrix
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = dense matrix * sparse matrix
-    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!aTrans_) << "Not supported a transpose";
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].sparse().SparseMatrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = sparse matrix * dense matrix
-    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!bTrans_) << "Not supported b transpose";
-      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
-          << "Only supported SPARSE_CSR format for sparse matrix a";
-      MulOp<Device>(outMat,
-                    inputs[0].sparse().SparseMatrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// sparse matrix = dense matrix * dense matrix
-    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        outputs[0].isSparseArg()) {
-      MulOp<Device>(outSparseMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-  }
-
- private:
-  bool aTrans_;
-  bool bTrans_;
-};
-
-REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.h b/paddle/legacy/function/MulOp.h
deleted file mode 100644
index ab33bde17..000000000
--- a/paddle/legacy/function/MulOp.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-/// CPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuSparseMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuSparseMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuSparseMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpGpu.cu b/paddle/legacy/function/MulOpGpu.cu
deleted file mode 100644
index 217c983cb..000000000
--- a/paddle/legacy/function/MulOpGpu.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "hl_base.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_mul(const_cast<real*>(a.getData()),
-                !aTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(b.getData()),
-                !bTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(out.getData()),
-                out.getHeight(),
-                out.getWidth(),
-                !aTrans ? a.getWidth() : a.getHeight(),
-                scaleAB,
-                scaleT,
-                a.getStride(),
-                b.getStride(),
-                out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuSparseMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
-                          aTrans ? HPPL_OP_T : HPPL_OP_N,
-                          const_cast<real*>(b.getData()),
-                          HPPL_OP_N,
-                          const_cast<real*>(out.getData()),
-                          out.getHeight(),
-                          out.getWidth(),
-                          b.getHeight(),
-                          scaleAB,
-                          scaleT);
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  }
-}
-
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
-                       aTrans ? HPPL_OP_T : HPPL_OP_N,
-                       const_cast<real*>(b.getData()),
-                       bTrans ? HPPL_OP_T : HPPL_OP_N,
-                       out.sMatrix_.get(),
-                       out.getHeight(),
-                       out.getWidth(),
-                       !bTrans ? b.getHeight() : b.getWidth(),
-                       scaleAB,
-                       scaleT);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpTest.cpp b/paddle/legacy/function/MulOpTest.cpp
deleted file mode 100644
index ab08b6f86..000000000
--- a/paddle/legacy/function/MulOpTest.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/tests/test_matrixUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-/**
- *  C += A * B, A, B, C dense matrix
- *  dense = dense * dense
- */
-void testFuncDDDMatrix(
-    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
-  real scaleT = 1.0;
-  size_t heightA = (transa == false) ? dimM : dimK;
-  size_t widthA = (transa == false) ? dimK : dimM;
-  size_t heightB = (transb == false) ? dimK : dimN;
-  size_t widthB = (transb == false) ? dimN : dimK;
-  size_t heightC = dimM;
-  size_t widthC = dimN;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
-  // prepare input arguments
-  /// matrix A : HA * WA
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
-  /// matrix B: HB * WB
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
-
-  /// output matrix C: HC * WC
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDDMatrixMul) {
-  LOG(INFO) << "function test for dense = dense * dense matrix";
-  for (const auto transa : {false, true}) {
-    for (const auto transb : {false, true}) {
-      for (const auto dimM : {1, 10, 100}) {
-        for (const auto dimN : {1, 10}) {
-          for (const auto dimK : {8}) {
-            if (transa && transb) {
-              continue;
-            }
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK;
-            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, B, C dense, A sparse
- * dense = sparse * dense
- */
-void testFuncDSparseDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// sparse matrix A : M * K
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MuLOp, DSparseDMul) {
-  LOG(INFO) << "function test for dense = sparse * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A, C dense, B sparse
- * dense = dense * sparse
- */
-void testFuncDDSparseMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDSparseMul) {
-  LOG(INFO) << "function test for dense = dense * sparse matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A sparse, B, C dense
- * sparse = dense * dense
- */
-void testFuncSparseDDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output sparse matrix C: M * N
-  test.addOutputs(
-      SparseMatrixArg(
-          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
-      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, SparseDDMul) {
-  LOG(INFO) << "function test for sparse = dense * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/function/NaiveConvOp.cpp b/paddle/legacy/function/NaiveConvOp.cpp
deleted file mode 100644
index 99c8b81ac..000000000
--- a/paddle/legacy/function/NaiveConvOp.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-
-namespace paddle {
-
-/*
- * The three arguments are stored in memory in row major order.
- * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
- * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
- * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
- */
-template <class T>
-class NaiveConvFunctor {
- public:
-  void operator()(const T* inputData,
-                  size_t batchSize,
-                  size_t inputChannels,
-                  size_t inputHeight,
-                  size_t inputWidth,
-                  const T* filterData,
-                  size_t filterHeight,
-                  size_t filterWidth,
-                  T* outputData,
-                  size_t outputChannels,
-                  size_t outputHeight,
-                  size_t outputWidth,
-                  size_t paddingH,
-                  size_t paddingW,
-                  size_t strideH,
-                  size_t strideW) {
-    for (size_t batch = 0; batch < batchSize; batch++) {
-      for (size_t outC = 0; outC < outputChannels; outC++) {
-        for (size_t outH = 0; outH < outputHeight; outH++) {
-          for (size_t outW = 0; outW < outputWidth; outW++) {
-            const int inStartH = (outH * strideH) - paddingH;
-            const int inStartW = (outW * strideW) - paddingW;
-            T outValue = (T)0;
-            for (size_t inC = 0; inC < inputChannels; inC++) {
-              for (size_t fH = 0; fH < filterHeight; fH++) {
-                for (size_t fW = 0; fW < filterWidth; fW++) {
-                  T inValue;
-                  const int inH = inStartH + fH;
-                  const int inW = inStartW + fW;
-                  if ((inH >= 0 && inH < (int)inputHeight) &&
-                      (inW >= 0 && inW < (int)inputWidth)) {
-                    size_t offsetInput =
-                        batch * inputChannels * inputHeight * inputWidth +
-                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
-                    inValue = inputData[offsetInput];
-                  } else {
-                    inValue = (T)0;
-                  }
-                  size_t offsetFilter =
-                      outC * inputChannels * filterHeight * filterWidth +
-                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
-                  T filterValue = filterData[offsetFilter];
-                  outValue += (inValue * filterValue);
-                }
-              }
-            }
-
-            size_t offset =
-                batch * outputChannels * outputHeight * outputWidth +
-                outC * outputHeight * outputWidth + outH * outputWidth + outW;
-            outputData[offset] = outValue;
-          }
-        }
-      }
-    }
-  }
-};
-
-template <DeviceType Device>
-class NaiveConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t inputChannels = inputs[0].shape()[1];
-    size_t inputHeight = inputs[0].shape()[2];
-    size_t inputWidth = inputs[0].shape()[3];
-    size_t filterHeight = inputs[1].shape()[2];
-    size_t filterWidth = inputs[1].shape()[3];
-    size_t outputChannels = outputs[0].shape()[1];
-    size_t outputHeight = outputs[0].shape()[2];
-    size_t outputWidth = outputs[0].shape()[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    NaiveConvFunctor<real> conv;
-    conv(inputData,
-         batchSize,
-         inputChannels,
-         inputHeight,
-         inputWidth,
-         filterData,
-         filterHeight,
-         filterWidth,
-         outputData,
-         outputChannels,
-         outputHeight,
-         outputWidth,
-         paddingH(),
-         paddingW(),
-         strideH(),
-         strideW());
-  }
-};
-
-REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOp.cpp b/paddle/legacy/function/PadOp.cpp
deleted file mode 100644
index 9d011d28e..000000000
--- a/paddle/legacy/function/PadOp.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Pad<DEVICE_TYPE_CPU>(real* outputs,
-                          const real* inputs,
-                          const int num,
-                          const int inC,
-                          const int inH,
-                          const int inW,
-                          const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
-                              const real* outGrad,
-                              const int num,
-                              const int inC,
-                              const int inH,
-                              const int inW,
-                              const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        CpuVector inG = CpuVector(inW, inGrad + inoff);
-        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
-        inG += outG;
-      }
-    }
-  }
-}
-
-static inline PadConf castToPadConf(const FuncConfig& conf) {
-  return {conf.get<std::vector<uint32_t>>("channel"),
-          conf.get<std::vector<uint32_t>>("height"),
-          conf.get<std::vector<uint32_t>>("width")};
-}
-
-/**
- * \brief Padding zeros to input according to the specify dimension.
- *        The struct pad_ contains the padding size in each dimension.
- *        The input and output is a 4D tensor. In PadFunc, we only
- *        pad zeros to the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the padding size in each dimension.
- *                It has six integers. The channelStart and channelEnd indicate
- *                how many zeros to add before and after the input in channel
- *                dimension. And the heightStart and heightEnd indicate padding
- *                in height dimension. The widthStart and widthEnd indicate the
- *                padding in width dimension.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after padding.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the shape is (1,2,2,3)
- *
- * pad_: if channelStart = channelEnd = 1, others are 0.
- * Output(2,4,2,3) = [
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]],
- *                      [[0,0,0], [0,0,0]] ],
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]],
- *                      [[0,0,0], [0,0,0]] ]
- *                   ] # the shape is (2,4,2,3)
- *
- * pad_: if widthStart = 1, widthEnd = 2, others are 0.
- * Output(2,2,2,6) = [
- *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
- *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
- *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
- *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
- *                   ] # the shape is (2,2,2,6)
- *
- * pad_: if heightStart = 1, heightEnd = 1, others are 0.
- * Output(2,2,4,3) = [
- *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
- *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
- *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
- *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
- *                   ] # the shape is (2,2,4,3)
- */
-
-template <DeviceType Device>
-class PadFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
-                                              outputs[0].data<real>());
-    vec.zero();
-
-    Pad<Device>(outputs[0].data<real>(),
-                inputs[0].data<real>(),
-                num,
-                inC,
-                inH,
-                inW,
-                pad_);
-  }
-
- private:
-  PadConf pad_;
-};
-
-/**
- * \brief The backward propagation of padding Function. Remove the elements
- *        in the padding positions of forward.
- *
- * Argument in this Function:
- * \param pad_    The same meaning as it in PadFunc.
- * \param inputs  The gradient with respect to the output value of PadFunc.
- * \param outputs The gradient with respect to the input value of PadFunc.
- */
-
-template <DeviceType Device>
-class PadGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = outputs[0].shape()[0];
-    size_t inC = outputs[0].shape()[1];
-    size_t inH = outputs[0].shape()[2];
-    size_t inW = outputs[0].shape()[3];
-
-    if (outputs[0].getArgType() != ADD_TO) {
-      // for unit test
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    PadGrad<Device>(outputs[0].data<real>(),
-                    inputs[0].data<real>(),
-                    num,
-                    inC,
-                    inH,
-                    inW,
-                    pad_);
-  }
-
- private:
-  PadConf pad_;
-};
-
-REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOp.h b/paddle/legacy/function/PadOp.h
deleted file mode 100644
index 4b0aa4014..000000000
--- a/paddle/legacy/function/PadOp.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-struct PadConf {
-  /// how many values to add before/after the data along channel dimension.
-  std::vector<uint32_t> channel;
-  /// how many values to add before/after the data along height dimension.
-  std::vector<uint32_t> height;
-  /// how many values to add before/after the data along width dimension.
-  std::vector<uint32_t> width;
-};
-
-/**
- * \brief  This funtion pads zeros to inputs according to the specify dimension.
- *         The input and output is a 4D tensor. Padding zeros from the 2nd to
- *         the 4th dimenstion according argument of pad.
- *
- * \param[out] outputs save results.
- * \param[in]  inputs  input data.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  pad     the padding config, contains the size along the
- *                     specify dimension.
- */
-template <DeviceType Device>
-void Pad(real* outputs,
-         const real* inputs,
-         const int num,
-         const int inC,
-         const int inH,
-         const int inW,
-         const PadConf& pad);
-
-/**
- * \brief   Padding operation backward.
- *
- * \param[out] inGrad  gradients of previous layer.
- * \param[in]  outGrad output gradients.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  pad     the padding config, contains the size along the
- *                     specify dimension.
- */
-template <DeviceType Device>
-void PadGrad(real* inGrad,
-             const real* outGrad,
-             const int num,
-             const int inC,
-             const int inH,
-             const int inW,
-             const PadConf& pad);
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOpGpu.cu b/paddle/legacy/function/PadOpGpu.cu
deleted file mode 100644
index 01d9b5c3b..000000000
--- a/paddle/legacy/function/PadOpGpu.cu
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KePad(real* outputs,
-                      const real* inputs,
-                      int inC,
-                      int inH,
-                      int inW,
-                      int padc,
-                      int padh,
-                      int padw,
-                      int outC,
-                      int outH,
-                      int outW,
-                      int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
-    outputs[off] = inputs[idx];
-  }
-}
-
-template <>
-void Pad<DEVICE_TYPE_GPU>(real* outputs,
-                          const real* inputs,
-                          const int num,
-                          const int inC,
-                          const int inH,
-                          const int inW,
-                          const PadConf& pad) {
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
-                                                    inputs,
-                                                    inC,
-                                                    inH,
-                                                    inW,
-                                                    cstart,
-                                                    hstart,
-                                                    wstart,
-                                                    outC,
-                                                    outH,
-                                                    outW,
-                                                    nth);
-  CHECK_SYNC("Pad");
-}
-
-__global__ void KePadDiff(real* inGrad,
-                          const real* outGrad,
-                          int inC,
-                          int inH,
-                          int inW,
-                          int padc,
-                          int padh,
-                          int padw,
-                          int outC,
-                          int outH,
-                          int outW,
-                          int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
-    inGrad[idx] += outGrad[off];
-  }
-}
-
-template <>
-void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
-                              const real* outGrad,
-                              const int num,
-                              const int inC,
-                              const int inH,
-                              const int inW,
-                              const PadConf& pad) {
-  int nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
-                                                        outGrad,
-                                                        inC,
-                                                        inH,
-                                                        inW,
-                                                        cstart,
-                                                        hstart,
-                                                        wstart,
-                                                        outC,
-                                                        outH,
-                                                        outW,
-                                                        nth);
-  CHECK_SYNC("PadGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/PadOpTest.cpp b/paddle/legacy/function/PadOpTest.cpp
deleted file mode 100644
index a4474f854..000000000
--- a/paddle/legacy/function/PadOpTest.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Pad, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {1, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {false, true}) {
-            CpuGpuFuncCompare compare(
-                test_grad ? "PadGrad" : "Pad",
-                FuncConfig()
-                    .set<std::vector<uint32_t>>("channel", {2, 3})
-                    .set<std::vector<uint32_t>>("height", {1, 2})
-                    .set<std::vector<uint32_t>>("width", {3, 2}));
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{
-                numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(
-                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOp.cpp b/paddle/legacy/function/RowConvOp.cpp
deleted file mode 100644
index 3be50e80d..000000000
--- a/paddle/legacy/function/RowConvOp.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvOp.h"
-#include <iostream>
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                              const CpuMatrix& in,
-                              const CpuMatrix& filter,
-                              const CpuIVector& seq) {
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  for (size_t i = 0; i < numSeq; ++i) {
-    size_t begin = starts[i];
-    size_t end = starts[i + 1];
-    for (size_t j = begin; j < end; ++j) {
-      MatrixPtr x;
-      MatrixPtr w;
-      if ((j + contextLength) < end) {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
-      } else {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
-      }
-      MatrixPtr y = out.subMatrix(j, 1);
-      y->addDotMulVMM(*x, *w);
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
-                                  const CpuMatrix& in,
-                                  const CpuMatrix& filter,
-                                  CpuMatrix& inG,
-                                  CpuMatrix& filterG,
-                                  const CpuIVector& seq) {
-  // gradient w.r.t filter
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  if (filterG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
-        MatrixPtr x =
-            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
-        MatrixPtr dy =
-            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
-        MatrixPtr dw = filterG.subMatrix(j, 1);
-        dw->addDotMulVMM(*dy, *x);
-      }
-    }
-  }
-
-  // gradient w.r.t input feature
-  if (inG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < steps; ++j) {
-        MatrixPtr dx = inG.subMatrix(begin + j, 1);
-        for (size_t t = 0; t < contextLength; ++t) {
-          if (int(j - t) >= 0) {
-            MatrixPtr dy =
-                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
-            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
-            dx->addDotMul(*dy, *w, 1.0, 1.0);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief The row convolution is called lookahead convolution. It is firstly
- * introduced in deep-speech2 system. The bidirectional RNN that learns
- * representation for a sequence by performing a forward and a backward pass
- * through the entire sequence. However, unlike unidirectional RNNs,
- * bidirectional RNNs are challenging to deploy in an online and low-latency
- * setting. The lookahead convolution incorporates information from future
- * subsequences in a computationally efficient manner to improve unidirectional
- * recurrent neural networks.
- *
- * The connection of row convolution is different form the 1D sequence
- * convolution. Assumed that, the future context-length is k, that is to say,
- * it can get the output at timestep t by using the the input feature from t-th
- * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
- * activations are d, the activations r_t for the new layer at time-step t are:
- *
- *
- *            -- k + 1
- *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
- *            -- j = 1
- *
- *
- * The weight shape is: (k + 1) x d
- * Function Arguments:
- *
- * \param inputs[0]  The input activations.
- * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[1] The output activations.
- *
- * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
- * English
- *     and Mandarin. https://arxiv.org/abs/1512.02595
- */
-
-template <DeviceType Device>
-class RowConvFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    // TODO(qingqing): support ASSIGN_TO.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto w = inputs[1];
-    CHECK(in.data() && out.data() && in.getSequenceId().data());
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == out.shape());
-    CHECK_EQ(w.shape()[1], in.shape()[1]);
-
-    auto outMat = out.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConv<Device>(outMat, inMat, wMat, seqId);
-  }
-};
-
-/**
- * \brief The backward of row convolution function. This function calculated
- * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
- *
- * Argument in this Function:
- *
- * \param inputs[0]  The gradient w.r.t output activations.
- * \param inputs[1]  The input activations.
- * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[0] The gradient w.r.t input activations.
- * \param outputs[1] The gradient w.r.r filter.
- *
- * Abbreviation:
- * w.r.t: with respect to.
- */
-
-template <DeviceType Device>
-class RowConvGradFunc : public FunctionBase {
-  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(3UL, inputs.size());
-    CHECK_EQ(2UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
-          outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-
-    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
-    const auto w = inputs[2];
-    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto wGrad = outputs[1];
-
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == inGrad.shape());
-    CHECK(in.shape() == outGrad.shape());
-    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
-
-    const auto outGMat = outGrad.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    auto inGMat = inGrad.data()
-                      ? inGrad.matrix<Device>()
-                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    auto wGMat = wGrad.data()
-                     ? wGrad.matrix<Device>()
-                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
-  }
-};
-
-REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOp.h b/paddle/legacy/function/RowConvOp.h
deleted file mode 100644
index bfe775e01..000000000
--- a/paddle/legacy/function/RowConvOp.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief The forward of row convolution.
- *
- * \param[out] out      The output data and shape is h x d. h is the sum of
- *                      time steps of all samples in one mini-batch.
- * \param[in]  in       The input data and shape is h x d.
- * \param[in]  filter   The filter and shape is k x d. The lookahead step
- *                      number plus one equals k.
- * \param[in]  seq      The sequence start positions.
- *
- */
-template <DeviceType DType>
-void RowConv(typename Tensor<real, DType>::Matrix& out,
-             const typename Tensor<real, DType>::Matrix& in,
-             const typename Tensor<real, DType>::Matrix& filter,
-             const typename Tensor<int, DType>::Vector& seq);
-
-/**
- * \brief The backward of row convolution.
- *
- * \param[in]  outG     The gradient w.r.t output data.
- * \param[in]  in       The input data.
- * \param[in]  filter   The filter.
- * \param[out] inG      The gradient w.r.t input data.
- * \param[out] filterG  The gradient w.r.t filter.
- * \param[in]  seq      The sequence start positions.
- *
- */
-template <DeviceType DType>
-void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
-                 const typename Tensor<real, DType>::Matrix& in,
-                 const typename Tensor<real, DType>::Matrix& filter,
-                 typename Tensor<real, DType>::Matrix& inG,
-                 typename Tensor<real, DType>::Matrix& filterG,
-                 const typename Tensor<int, DType>::Vector& seq);
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOpGpu.cu b/paddle/legacy/function/RowConvOpGpu.cu
deleted file mode 100644
index a6d2e4c7e..000000000
--- a/paddle/legacy/function/RowConvOpGpu.cu
+++ /dev/null
@@ -1,373 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/cuda/include/hl_base.h"
-#include "paddle/legacy/function/RowConvOp.h"
-
-namespace paddle {
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConv(real* y,
-                          const real* x,
-                          const real* w,
-                          const int* starts,
-                          const int height,
-                          const int width,
-                          const int numSeq,
-                          const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context; ++t) {
-        if ((start + j + t) < end) {
-          int xoff = off + t * width;
-          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-          sum += sw[t][tidx] * xVal;
-        }
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConv2(real* y,
-                           const real* x,
-                           const real* w,
-                           const int* starts,
-                           const int height,
-                           const int width,
-                           const int numSeq,
-                           const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      int off = (start + j) * width;
-      real sum = 0;
-      for (int t = 0; t < context && (start + j + t) < end; ++t) {
-        int xoff = off + t * width;
-        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wd * xd;
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,  // NOLINT
-                              const GpuMatrix& in,
-                              const GpuMatrix& filter,
-                              const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  real* y = out.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  dim3 dimBlock(32, 32);
-  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-
-  if (contextLength <= 32) {
-    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  }
-  CHECK_SYNC("RowConv");
-}
-
-template <int BLOCK_H, int BLOCK_W, int CONTEXT>
-__global__ void KeRowConvBwWeight(real* dw,
-                                  const real* x,
-                                  const real* dy,
-                                  const int* starts,
-                                  const int height,
-                                  const int width,
-                                  const int numSeq,
-                                  const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_W][BLOCK_H];
-  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
-  __shared__ real sh_dw[CONTEXT][BLOCK_W];
-
-  if (tidy < context) {
-    sh_dw[tidy][tidx] = 0.0;
-  }
-  __syncthreads();
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy + context - 1] =
-          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
-      __syncthreads();
-      if (tidy < (context - 1)) {
-        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] =
-            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
-      }
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-
-        for (int offset = 16; offset > 0; offset /= 2)
-          val += __shfl_down_sync(mask, val, offset);
-
-        __syncthreads();
-        if (tidx == 0) {
-          sh_dw[t][tidy] += val;
-        }
-        __syncthreads();
-      }
-    }
-  }
-
-  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
-    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwWeight2(real* dw,
-                                   const real* x,
-                                   const real* dy,
-                                   const int* starts,
-                                   const int height,
-                                   const int width,
-                                   const int numSeq,
-                                   const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_H][BLOCK_W];
-  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
-
-  // NOTE(zcd): temporary solution
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, true);
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] =
-            (xoff < width && (yoff - t) >= start && yoff - t < end)
-                ? dy[(yoff - t) * width + xoff]
-                : 0.0;
-        __syncthreads();
-
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-        for (int offset = 16; offset > 0; offset /= 2)
-          val += __shfl_down_sync(mask, val, offset);
-
-        __syncthreads();
-
-        if (tidx == 0 && (gidx + tidy) < width) {
-          dw[t * width + gidx + tidy] += val;
-        }
-      }
-    }
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwData(real* dx,
-                                const real* w,
-                                const real* dy,
-                                const int* starts,
-                                const int height,
-                                const int width,
-                                const int numSeq,
-                                const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        sum += sw[t][tidx] * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConvBwData2(real* dx,
-                                 const real* w,
-                                 const real* dy,
-                                 const int* starts,
-                                 const int height,
-                                 const int width,
-                                 const int numSeq,
-                                 const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wVal * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
-                                  const GpuMatrix& in,
-                                  const GpuMatrix& filter,
-                                  GpuMatrix& inG,      // NOLINT
-                                  GpuMatrix& filterG,  // NOLINT
-                                  const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  const real* dy = outG.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  if (filterG) {
-    dim3 dimBlock(32, 32);
-    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-    real* dw = filterG.getData();
-    if (contextLength <= 32) {
-      KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  if (inG) {
-    real* dx = inG.getData();
-    dim3 dimBlock2(32, 32);
-    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
-    if (contextLength <= 64) {
-      KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  CHECK_SYNC("RowConvGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/RowConvOpTest.cpp b/paddle/legacy/function/RowConvOpTest.cpp
deleted file mode 100644
index bbc29ad6a..000000000
--- a/paddle/legacy/function/RowConvOpTest.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
-  CpuGpuFuncCompare test("RowConv", FuncConfig());
-
-  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
-
-  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
-                  ADD_TO);
-
-  test.run();
-}
-
-void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
-  CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
-
-  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
-
-  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
-                  ADD_TO);
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
-                  ADD_TO);
-
-  test.run();
-}
-
-TEST(RowConv, real) {
-  for (size_t numSamples : {17, 129, 2020}) {
-    for (size_t dim : {16, 512, 2560}) {
-      for (size_t context : {3, 19, 65}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " context length=" << context;
-        testRowConvFw(numSamples, dim, context);
-        testRowConvBw(numSamples, dim, context);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOp.cpp b/paddle/legacy/function/ScaleSubRegionOp.cpp
deleted file mode 100644
index 03a422a74..000000000
--- a/paddle/legacy/function/ScaleSubRegionOp.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionOp.h"
-#include "paddle/legacy/function/TensorShape.h"
-
-namespace paddle {
-
-template <>
-void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
-                                     const real* inputs,
-                                     const real* indices,
-                                     const TensorShape shape,
-                                     const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
-
-  for (int n = 0; n < number; ++n) {
-    // indices start from 1
-    int offset = n * 6;
-    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
-      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
-        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          outputs[idx] *= value;
-        }
-      }
-    }
-  }
-}
-
-template <>
-void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                                         real* outGrad,
-                                         const real* indices,
-                                         const TensorShape shape,
-                                         const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  for (int n = 0; n < number; ++n) {
-    for (int c = 0; c < channel; ++c) {
-      for (int h = 0; h < height; ++h) {
-        for (int w = 0; w < width; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          int offset = n * 6;
-          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-              h >= (indices[offset + 2] - 1) &&
-              h <= (indices[offset + 3] - 1) &&
-              w >= (indices[offset + 4] - 1) &&
-              w <= (indices[offset + 5] - 1)) {
-            outGrad[idx] += inGrad[idx] * value;
-          } else {
-            outGrad[idx] += inGrad[idx];
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief For each instance, ScaleSubRegion can be used to multiply a value to
- *        a specified sub continuous region. By providing start index and end
- *        index for C/H/W, you can specify the location and shape of the region.
- *
- * Argument in this Function:
- * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
- * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs   A 4-D tensor with same shape as inputs, output value.
- */
-template <DeviceType Device>
-class ScaleSubRegionFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegion<Device>(outputs[0].data<real>(),
-                           inputs[0].data<real>(),
-                           inputs[1].data<real>(),
-                           shape,
-                           conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of ScaleSubRegion Function.
- *
- * Argument in this Function:
- * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
- * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
- */
-
-template <DeviceType Device>
-class ScaleSubRegionGradFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
-                               outputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               shape,
-                               conf_);
-  }
-
- private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOp.h b/paddle/legacy/function/ScaleSubRegionOp.h
deleted file mode 100644
index ed7d6b8ad..000000000
--- a/paddle/legacy/function/ScaleSubRegionOp.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief Function to multiply a value to values in specified sub continuous
- *        region. Indices must be provided to indcate the location and shape of
- *        the region and the multiplied value is passed by configure variable.
- *
- *
- * \param[out] outputs  Output value.
- * \param[in]  inputs   Input data which contains NCHW information.
- * \param[in]  indices  Indices data to indcate the sub region.
- * \param[in]  shape    Tensor shape of input value.
- * \param[in]  conf     Configure variable which contains the multiplied value.
- */
-template <DeviceType Device>
-void ScaleSubRegion(real* outputs,
-                    const real* inputs,
-                    const real* indices,
-                    const TensorShape shape,
-                    const FuncConfig& conf);
-
-/**
- * \brief Backward propagation function of ScaleSubRegion.
- *
- * \param[out] inGrad   Gradients of previous layer.
- * \param[in]  outGrad  Output gradient.
- * \param[in]  indices  Indices data.
- * \param[in]  shape    The Shape of input tensor.
- * \param[in]  conf     Configure variable.
- */
-template <DeviceType Device>
-void ScaleSubRegionGrad(const real* inGrad,
-                        real* outGrad,
-                        const real* indices,
-                        const TensorShape shape,
-                        const FuncConfig& conf);
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOpGpu.cu b/paddle/legacy/function/ScaleSubRegionOpGpu.cu
deleted file mode 100644
index 9784c51ae..000000000
--- a/paddle/legacy/function/ScaleSubRegionOpGpu.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeScaleSubRegion(real* outputs,
-                                 const real* inputs,
-                                 const real* indices,
-                                 real value,
-                                 int channel,
-                                 int height,
-                                 int width,
-                                 int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int c = (idx / width / height) % channel;
-    const int n = idx / width / height / channel;
-
-    const int offset = n * 6;
-    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
-        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
-      outputs[idx] = inputs[idx] * value;
-    } else {
-      outputs[idx] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
-                                     const real* inputs,
-                                     const real* indices,
-                                     const TensorShape shape,
-                                     const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  size_t nth = number * channel * height * width;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, indices, value, channel, height, width, nth);
-  CHECK_SYNC("ScaleSubRegion");
-}
-
-__global__ void KeScaleSubRegionDiff(const real* inGrad,
-                                     real* outGrad,
-                                     const real* indices,
-                                     real value,
-                                     int channel,
-                                     int height,
-                                     int width,
-                                     int nthreads) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % width;
-    const int h = (idx / width) % height;
-    const int c = (idx / width / height) % channel;
-    const int n = idx / width / height / channel;
-
-    const int offset = n * 6;
-    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
-        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
-      outGrad[idx] += inGrad[idx] * value;
-    } else {
-      outGrad[idx] += inGrad[idx];
-    }
-  }
-}
-
-template <>
-void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
-                                         real* outGrad,
-                                         const real* indices,
-                                         const TensorShape shape,
-                                         const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  size_t nth = number * channel * height * width;
-  int blockSize = 1024;
-  int gridSize = (nth + blockSize - 1) / blockSize;
-
-  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      inGrad, outGrad, indices, value, channel, height, width, nth);
-  CHECK_SYNC("ScaleSubRegionGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/ScaleSubRegionOpTest.cpp b/paddle/legacy/function/ScaleSubRegionOpTest.cpp
deleted file mode 100644
index dd6ee6710..000000000
--- a/paddle/legacy/function/ScaleSubRegionOpTest.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(ScaleSubRegion, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {5, 32}) {
-      for (size_t imgSizeH : {5, 33}) {
-        for (size_t imgSizeW : {5, 32}) {
-          for (real value : {-0.5, 0.0, 0.5}) {
-            for (bool firstHalf : {false, true}) {
-              VLOG(3) << " numSamples=" << numSamples
-                      << " channels=" << channels << " imgSizeH=" << imgSizeH
-                      << " imgSizeW=" << imgSizeW;
-
-              for (bool testGrad : {false, true}) {
-                CpuGpuFuncCompare compare(
-                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
-                    FuncConfig().set<real>("value", value));
-
-                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-                TensorShape indicesShape{numSamples, 6};
-
-                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
-
-                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
-                  if (index == 1) {
-                    real* data = (real*)arg.data();
-
-                    for (size_t i = 0; i < numSamples; ++i) {
-                      size_t offset = i * 6;
-                      data[offset] = firstHalf ? 1 : channels / 2;
-                      data[offset + 1] = firstHalf ? channels / 2 : channels;
-                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
-                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
-                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
-                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
-                    }
-                  }
-                });
-
-                compare.addOutputs(
-                    BufferArg(
-                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
-                    testGrad ? ADD_TO : ASSIGN_TO);
-                compare.run();
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOp.cpp b/paddle/legacy/function/SwitchOp.cpp
deleted file mode 100644
index c6accd180..000000000
--- a/paddle/legacy/function/SwitchOp.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOp.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inC,
-                                const int inH,
-                                const int inW,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < inC; ++c) {
-      for (int h = 0; h < inH; ++h) {
-        for (int w = 0; w < inW; ++w) {
-          if (argType == ADD_TO) {
-            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
-          } else {
-            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <>
-void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inH,
-                                const int inW,
-                                const int inC,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int h = 0; h < inH; ++h) {
-      for (int w = 0; w < inW; ++w) {
-        for (int c = 0; c < inC; ++c) {
-          if (argType == ADD_TO) {
-            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
-          } else {
-            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size,channels, height, width' to
- *         order 'batch_size, height, width, channels'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size,channels, height, width'.
- * \param outputs output data with order 'batch_size, height, width, channels'.
- */
-template <DeviceType Device>
-class NCHW2NHWCFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    NCHW2NHWC<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inC,
-                      inH,
-                      inW,
-                      outputs[0].getArgType());
-  }
-};
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size, height, width, channels' to
- *         order 'batch_size, channels, height, width'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size, height, width, channels'.
- * \param outputs output data with order 'batch_size, channels, height, width'.
- */
-template <DeviceType Device>
-class NHWC2NCHWFunc : public FunctionBase {
- public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inH = inputs[0].shape()[1];
-    size_t inW = inputs[0].shape()[2];
-    size_t inC = inputs[0].shape()[3];
-
-    NHWC2NCHW<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inH,
-                      inW,
-                      inC,
-                      outputs[0].getArgType());
-  }
-};
-
-REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOp.h b/paddle/legacy/function/SwitchOp.h
deleted file mode 100644
index b5eb0883c..000000000
--- a/paddle/legacy/function/SwitchOp.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/**
- * \brief  This funtion switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order 'batch_size,
- *channels, height, width' to
- *         order 'batch_size, height, width, channels'.
- *
- * \param[out] outputs save results.
- * \param[in]  inputs  input data.
- * \param[in]  num     batch size of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inH     with of input data.
- * \param[in]  argType     type of output argument.
- */
-template <DeviceType Device>
-void NCHW2NHWC(real* outputs,
-               const real* inputs,
-               const int num,
-               const int inC,
-               const int inH,
-               const int inW,
-               const int argtype);
-
-/**
- * \brief  This funtion switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order 'batch_size,
- *height, width, channels' to
- *         order 'batch_size, channels, height, width'.
- *
- * \param[out] inGrad  gradients of previous layer.
- * \param[in]  outGrad output gradients.
- * \param[in]  num     batch size of input data.
- * \param[in]  inH     height of input data.
- * \param[in]  inW     with of input data.
- * \param[in]  inC     channel number of input data.
- * \param[in]  argType     type of output argument.
- */
-template <DeviceType Device>
-void NHWC2NCHW(real* inGrad,
-               const real* outGrad,
-               const int num,
-               const int inH,
-               const int inW,
-               const int inC,
-               const int argType);
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOpGpu.cu b/paddle/legacy/function/SwitchOpGpu.cu
deleted file mode 100644
index 45390a56c..000000000
--- a/paddle/legacy/function/SwitchOpGpu.cu
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 Paddle
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-__global__ void KeNCHW2NHWC(real* outputs,
-                            const real* inputs,
-                            int inC,
-                            int inH,
-                            int inW,
-                            int nthreads,
-                            int argType) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int w = idx % inW;
-    const int h = (idx / inW) % inH;
-    const int c = (idx / inW / inH) % inC;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * inH + h) * inW + w) * inC + c;
-    if (argType == ADD_TO) {
-      outputs[off] += inputs[idx];
-    } else {
-      outputs[off] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inC,
-                                const int inH,
-                                const int inW,
-                                const int argType) {
-  size_t nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, inC, inH, inW, nth, argType);
-  CHECK_SYNC("NCHW2NHWC");
-}
-
-__global__ void KeNHWC2NCHW(real* outputs,
-                            const real* inputs,
-                            int inH,
-                            int inW,
-                            int inC,
-                            int nthreads,
-                            int argType) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < nthreads) {
-    const int c = idx % inC;
-    const int w = (idx / inC) % inW;
-    const int h = (idx / inC / inW) % inH;
-    const int n = idx / inW / inH / inC;
-
-    const int off = ((n * inC + c) * inH + h) * inW + w;
-    if (argType == ADD_TO) {
-      outputs[off] += inputs[idx];
-    } else {
-      outputs[off] = inputs[idx];
-    }
-  }
-}
-
-template <>
-void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inH,
-                                const int inW,
-                                const int inC,
-                                const int argType) {
-  int nth = num * inC * inH * inW;
-  int blockSize = 1024;
-  int gridSize = (nth + 1024 - 1) / 1024;
-  KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-      outputs, inputs, inH, inW, inC, nth, argType);
-  CHECK_SYNC("NHWC2NCHW");
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/SwitchOpTest.cpp b/paddle/legacy/function/SwitchOpTest.cpp
deleted file mode 100644
index 08e5a613c..000000000
--- a/paddle/legacy/function/SwitchOpTest.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-
-namespace paddle {
-
-TEST(Pad, real) {
-  for (size_t numSamples : {1, 4, 8, 16}) {
-    for (size_t channels : {1, 4, 8, 16}) {
-      for (size_t imgSizeH : {1, 4, 8, 16}) {
-        for (size_t imgSizeW : {1, 4, 8, 16}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          for (bool test_grad : {true, false}) {
-            CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
-                                      FuncConfig());
-            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
-            TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
-            compare.addInputs(
-                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
-            compare.addOutputs(BufferArg(
-                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
-            compare.run();
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorShape.h b/paddle/legacy/function/TensorShape.h
deleted file mode 100644
index d4d1eae39..000000000
--- a/paddle/legacy/function/TensorShape.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-namespace paddle {
-
-/**
- * TensorShape used to represent shape of normal tensor.
- */
-class TensorShape {
- public:
-  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
-
-  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
-
-  TensorShape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    initDims(ndims_);
-    dims_.assign(dims);
-    numElements();
-  };
-
-  TensorShape(const TensorShape& t)
-      : ndims_(t.ndims_), nelements_(t.nelements_) {
-    initDims(ndims_);
-    dims_.assign(t.dims_.begin(), t.dims_.end());
-  };
-
-  // get the size of specified dimension
-  size_t operator[](size_t dim) const {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    return dims_[dim];
-  }
-
-  // set the size of specified dimension
-  void setDim(size_t dim, size_t size) {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    dims_[dim] = size;
-    numElements();
-  }
-
-  void reshape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    if (ndims_ > kMinDims) {
-      dims_.resize(ndims_);
-    }
-    dims_.assign(dims);
-    numElements();
-  }
-
-  // number of dimensions of the tensor
-  size_t ndims() const { return ndims_; }
-
-  size_t getElements() const { return nelements_; }
-
-  bool operator==(const TensorShape& t) const {
-    if (ndims() != t.ndims()) return false;
-    for (size_t i = 0; i < ndims(); i++) {
-      if (dims_[i] != t.dims_[i]) return false;
-    }
-
-    return true;
-  }
-
-  bool operator!=(const TensorShape& t) const { return !(*this == t); }
-
- private:
-  // compute number of elements
-  void numElements() {
-    nelements_ = 1;
-    for (size_t n = 0; n < ndims_; n++) {
-      nelements_ *= dims_[n];
-    }
-  }
-
-  // init dims_
-  void initDims(size_t ndims) {
-    size_t count = ndims < kMinDims ? kMinDims : ndims;
-    dims_.assign(count, 1);
-  }
-
-  // number of dimensions
-  // ndims_ may be not equeal dims_.size()
-  size_t ndims_;
-  // number of elements
-  size_t nelements_;
-  std::vector<size_t> dims_;
-  static const size_t kMinDims = 4;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorShapeTest.cpp b/paddle/legacy/function/TensorShapeTest.cpp
deleted file mode 100644
index 4d692b9b9..000000000
--- a/paddle/legacy/function/TensorShapeTest.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorShape.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-
-TEST(TensorShape, Constructor) {
-  TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0U);
-  EXPECT_EQ(t1.getElements(), 0U);
-
-  TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3U);
-  EXPECT_EQ(t2.getElements(), 1U);
-
-  TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2U);
-  EXPECT_EQ(t3.getElements(), 80U);
-
-  TensorShape t4(t3);
-  EXPECT_EQ(t4.ndims(), t3.ndims());
-  EXPECT_EQ(t4.getElements(), t3.getElements());
-
-  TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5U);
-  EXPECT_EQ(t5.getElements(), 120U);
-}
-
-TEST(TensorShape, GetAndSet) {
-  TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3U);
-  EXPECT_EQ(t.getElements(), 6U);
-
-  EXPECT_EQ(t[1], 2U);
-  t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300U);
-  EXPECT_EQ(t[1], 100U);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorType.h b/paddle/legacy/function/TensorType.h
deleted file mode 100644
index 13994821b..000000000
--- a/paddle/legacy/function/TensorType.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-enum ValueType {
-  VALUE_TYPE_INT32 = 0,
-  VALUE_TYPE_FLOAT = 1,
-  VALUE_TYPE_DOUBLE = 2,
-  VALUE_TYPE_BYTE = 3
-};
-
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2
-};
-
-enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
-
-enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
-
-inline int sizeOfValuType(ValueType valueType) {
-  if (valueType == VALUE_TYPE_INT32) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_FLOAT) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_DOUBLE) {
-    return 8;
-  } else {
-    LOG(FATAL) << "Unknown type: " << valueType;
-    return 0;
-  }
-}
-
-template <typename T>
-struct DataType;
-
-template <>
-struct DataType<float> {
-  static const ValueType value = VALUE_TYPE_FLOAT;
-};
-
-template <>
-struct DataType<double> {
-  static const ValueType value = VALUE_TYPE_DOUBLE;
-};
-
-template <>
-struct DataType<int> {
-  static const ValueType value = VALUE_TYPE_INT32;
-};
-
-namespace detail {
-
-template <typename VType, DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct SparseMatrixT;
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct VectorT;
-
-template <>
-struct VectorT<real, DEVICE_TYPE_CPU> {
-  using type = CpuVector;
-};
-
-template <>
-struct VectorT<real, DEVICE_TYPE_GPU> {
-  using type = GpuVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-
-}  // namespace detail
-
-template <typename VType, DeviceType DType>
-struct Tensor {
-  typedef typename detail::VectorT<VType, DType>::type Vector;
-  typedef typename detail::MatrixT<VType, DType>::type Matrix;
-  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/TensorTypeTest.cpp b/paddle/legacy/function/TensorTypeTest.cpp
deleted file mode 100644
index d0cd63147..000000000
--- a/paddle/legacy/function/TensorTypeTest.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorType.h"
-#include <gtest/gtest.h>
-
-namespace paddle {
-
-TEST(TensorType, Matrix) {
-  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
-  EXPECT_EQ(matrix.getHeight(), 100U);
-  EXPECT_EQ(matrix.getWidth(), 200U);
-  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
-  EXPECT_EQ(matrix.useGpu(), false);
-
-  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
-  EXPECT_EQ(testGpu.useGpu(), true);
-}
-
-TEST(TensorType, Vector) {
-  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);
-  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
-  EXPECT_EQ(cpuVector.useGpu(), false);
-  EXPECT_EQ(gpuVector.useGpu(), true);
-  EXPECT_EQ(cpuVector.getSize(), 100U);
-  EXPECT_EQ(gpuVector.getSize(), 100U);
-
-  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
-  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
-  EXPECT_EQ(cpuIVector.useGpu(), false);
-  EXPECT_EQ(gpuIVector.useGpu(), true);
-  EXPECT_EQ(cpuIVector.getSize(), 100U);
-  EXPECT_EQ(gpuIVector.getSize(), 100U);
-}
-
-TEST(TensorType, EmptyMatrix) {
-  CpuMatrix empty(nullptr, 0, 0);
-  CpuMatrix nonEmpty(10, 10);
-  EXPECT_EQ(empty.isEmpty(), true);
-  EXPECT_EQ(nonEmpty.isEmpty(), false);
-  CHECK(nonEmpty);
-  auto function = [](const CpuMatrix& matrix) {
-    if (matrix) {
-      EXPECT_NE(matrix.getData(), nullptr);
-    } else {
-      EXPECT_EQ(matrix.getData(), nullptr);
-    }
-  };
-  function(empty);
-  function(nonEmpty);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
deleted file mode 100644
index 6179635a9..000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input
-    float* inputPadding = inputData;
-    int padInputHeight = inputHeight + 2 * paddingH();
-    int padInputWidth = inputWidth + 2 * paddingW();
-    int newSize =
-        batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
-
-    resizeBuffer<Device>(newSize);
-    inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-    neon::Padding<float>::run(inputData,
-                              inputPadding,
-                              batchSize * inputChannels,
-                              inputHeight,
-                              inputWidth,
-                              padInputHeight,
-                              padInputWidth);
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 3 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
-    } else if (filterWidth == 4 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else if (filterWidth == 4 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.h b/paddle/legacy/function/neon/NeonDepthwiseConv.h
deleted file mode 100644
index 8b2cba263..000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConv.h
+++ /dev/null
@@ -1,627 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string.h>
-#include "neon_util.h"
-
-namespace paddle {
-namespace neon {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <int filterSize, int stride>
-struct DepthwiseConvKernel {};
-
-inline float32_t conv3x3(const float* r0,
-                         const float* r1,
-                         const float* r2,
-                         float32x4_t k0,
-                         float32x4_t k1,
-                         float32x4_t k2) {
-  float32_t tmp[12];
-  vst1q_f32(&(tmp[0]), k0);
-  vst1q_f32(&(tmp[4]), k1);
-  vst1q_f32(&(tmp[8]), k2);
-  float32_t sum0 = r0[0] * tmp[0] + r0[1] * tmp[1] + r0[2] * tmp[2];
-  float32_t sum1 = r1[0] * tmp[4] + r1[1] * tmp[5] + r1[2] * tmp[6];
-  float32_t sum2 = r2[0] * tmp[8] + r2[1] * tmp[9] + r2[2] * tmp[10];
-  return sum0 + sum1 + sum2;
-}
-
-inline float32_t conv4x4(float32x4_t r0,
-                         float32x4_t r1,
-                         float32x4_t r2,
-                         float32x4_t r3,
-                         float32x4_t k0,
-                         float32x4_t k1,
-                         float32x4_t k2,
-                         float32x4_t k3) {
-  float32x4_t tmp;
-  tmp = vmulq_f32(r0, k0);
-  tmp = vmlaq_f32(tmp, r1, k1);
-  tmp = vmlaq_f32(tmp, r2, k2);
-  tmp = vmlaq_f32(tmp, r3, k3);
-  return vaddvq_f32(tmp);
-}
-
-/**
- * Each step calculates four elements of the output.
- * First step:
- *   R0[0, 1, 2, 3...] * K[0][0]
- *   R0[1, 2, 3, 4...] * K[0][1]
- *   R0[2, 3, 4, 5...] * K[0][2]
- *   R1[0, 1, 2, 3...] * K[1][0]
- *   R1[1, 2, 3, 4...] * K[1][1]
- *   R1[2, 3, 4, 5...] * K[1][2]
- *   R2[0, 1, 2, 3...] * K[2][0]
- *   R2[1, 2, 3, 4...] * K[2][1]
- * + R2[2, 3, 4, 5...] * K[2][2]
- * ------------------------------
- *     Output[0, 1, 2, 3]
- */
-template <>
-struct DepthwiseConvKernel<3, 1> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 9) {
-      // Load the filters
-      float32x4_t k[3];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 3);
-      k[2] = vld1q_f32(filterData + 6);
-      k[0] = vsetq_lane_f32(0.f, k[0], 3);
-      k[1] = vsetq_lane_f32(0.f, k[1], 3);
-      k[2] = vsetq_lane_f32(0.f, k[2], 3);
-
-      const float* r0 =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      const float* r1 = r0 + inputWidth;
-      const float* r2 = r0 + inputWidth * 2;
-      float32x4_t input[3][3];
-      for (int h = 0; h < outputHeight; h++) {
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t tmp;
-          input[0][0] = vld1q_f32(r0);
-          tmp = vld1q_f32(r0 + 4);
-          input[0][1] = vextq_f32(input[0][0], tmp, 1);
-          input[0][2] = vextq_f32(input[0][0], tmp, 2);
-          input[1][0] = vld1q_f32(r1);
-          tmp = vld1q_f32(r1 + 4);
-          input[1][1] = vextq_f32(input[1][0], tmp, 1);
-          input[1][2] = vextq_f32(input[1][0], tmp, 2);
-          input[2][0] = vld1q_f32(r2);
-          tmp = vld1q_f32(r2 + 4);
-          input[2][1] = vextq_f32(input[2][0], tmp, 1);
-          input[2][2] = vextq_f32(input[2][0], tmp, 2);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 4;
-          r1 += 4;
-          r2 += 4;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          r0++;
-          r1++;
-          r2++;
-          outputData++;
-        }
-
-        r0 += 2;
-        r1 += 2;
-        r2 += 2;
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- * First step:
- *   R0[0, 2, 4, 6...] * K[0][0]
- *   R0[1, 3, 5, 7...] * K[0][1]
- *   R0[2, 4, 6, 8...] * K[0][2]
- *   R1[0, 2, 4, 6...] * K[1][0]
- *   R1[1, 3, 5, 7...] * K[1][1]
- *   R1[2, 4, 6, 8...] * K[1][2]
- *   R2[0, 2, 4, 6...] * K[2][0]
- *   R2[1, 3, 5, 7...] * K[2][1]
- *   R2[2, 4, 6, 8...] * K[2][2]
- * ------------------------------
- *     Output[0, 1, 2, 3]
- */
-template <>
-struct DepthwiseConvKernel<3, 2> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 9) {
-      // Load the filters
-      float32x4_t k[3];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 3);
-      k[2] = vld1q_f32(filterData + 6);
-      k[0] = vsetq_lane_f32(0.f, k[0], 3);
-      k[1] = vsetq_lane_f32(0.f, k[1], 3);
-      k[2] = vsetq_lane_f32(0.f, k[2], 3);
-
-      const float* start =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      float32x4_t input[3][3];
-      for (int h = 0; h < outputHeight; h++) {
-        const float* r0 = start + 2 * h * inputWidth;
-        const float* r1 = start + (2 * h + 1) * inputWidth;
-        const float* r2 = start + (2 * h + 2) * inputWidth;
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t data1;
-          float32x4x2_t data2;
-
-          data2 = vld2q_f32(r0);
-          input[0][0] = data2.val[0];
-          input[0][1] = data2.val[1];
-          data1 = vld1q_f32(r0 + 8);
-          input[0][2] = vextq_f32(data2.val[0], data1, 1);
-
-          data2 = vld2q_f32(r1);
-          input[1][0] = data2.val[0];
-          input[1][1] = data2.val[1];
-          data1 = vld1q_f32(r1 + 8);
-          input[1][2] = vextq_f32(data2.val[0], data1, 1);
-
-          data2 = vld2q_f32(r2);
-          input[2][0] = data2.val[0];
-          input[2][1] = data2.val[1];
-          data1 = vld1q_f32(r2 + 8);
-          input[2][2] = vextq_f32(data2.val[0], data1, 1);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 8;
-          r1 += 8;
-          r2 += 8;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
-          r0 += 2;
-          r1 += 2;
-          r2 += 2;
-          outputData++;
-        }
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- */
-template <>
-struct DepthwiseConvKernel<4, 1> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 16) {
-      // Load the filters
-      float32x4_t k[4];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 4);
-      k[2] = vld1q_f32(filterData + 8);
-      k[3] = vld1q_f32(filterData + 12);
-
-      const float* r0 =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      const float* r1 = r0 + inputWidth;
-      const float* r2 = r0 + inputWidth * 2;
-      const float* r3 = r0 + inputWidth * 3;
-      float32x4_t input[4][4];
-      for (int h = 0; h < outputHeight; h++) {
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4_t tmp;
-          input[0][0] = vld1q_f32(r0);
-          tmp = vld1q_f32(r0 + 4);
-          input[0][1] = vextq_f32(input[0][0], tmp, 1);
-          input[0][2] = vextq_f32(input[0][0], tmp, 2);
-          input[0][3] = vextq_f32(input[0][0], tmp, 3);
-
-          input[1][0] = vld1q_f32(r1);
-          tmp = vld1q_f32(r1 + 4);
-          input[1][1] = vextq_f32(input[1][0], tmp, 1);
-          input[1][2] = vextq_f32(input[1][0], tmp, 2);
-          input[1][3] = vextq_f32(input[1][0], tmp, 3);
-
-          input[2][0] = vld1q_f32(r2);
-          tmp = vld1q_f32(r2 + 4);
-          input[2][1] = vextq_f32(input[2][0], tmp, 1);
-          input[2][2] = vextq_f32(input[2][0], tmp, 2);
-          input[2][3] = vextq_f32(input[2][0], tmp, 3);
-
-          input[3][0] = vld1q_f32(r3);
-          tmp = vld1q_f32(r3 + 4);
-          input[3][1] = vextq_f32(input[3][0], tmp, 1);
-          input[3][2] = vextq_f32(input[3][0], tmp, 2);
-          input[3][3] = vextq_f32(input[3][0], tmp, 3);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 4;
-          r1 += 4;
-          r2 += 4;
-          r3 += 4;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          float32x4_t i3 = vld1q_f32(r3);
-          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
-          r0++;
-          r1++;
-          r2++;
-          r3++;
-          outputData++;
-        }
-
-        r0 += 3;
-        r1 += 3;
-        r2 += 3;
-        r3 += 3;
-      }
-    }
-  }
-};
-
-/**
- * Each step calculates four elements of the output.
- */
-template <>
-struct DepthwiseConvKernel<4, 2> {
-  static void run(const float* inputData,
-                  const float* filterData,
-                  int inputHeight,
-                  int inputWidth,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int filterMultiplier,
-                  float* outputData) {
-    const int steps = outputWidth >> 2;
-    const int remain = outputWidth & 3;
-    for (int c = 0; c < outputChannels; c++, filterData += 16) {
-      // Load the filters
-      float32x4_t k[4];
-      k[0] = vld1q_f32(filterData);
-      k[1] = vld1q_f32(filterData + 4);
-      k[2] = vld1q_f32(filterData + 8);
-      k[3] = vld1q_f32(filterData + 12);
-
-      const float* start =
-          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
-      float32x4_t input[4][4];
-      for (int h = 0; h < outputHeight; h++) {
-        const float* r0 = start + 2 * h * inputWidth;
-        const float* r1 = start + (2 * h + 1) * inputWidth;
-        const float* r2 = start + (2 * h + 2) * inputWidth;
-        const float* r3 = start + (2 * h + 3) * inputWidth;
-        for (int s = 0; s < steps; s++) {
-          // Load the inputs
-          float32x4x2_t data1;
-          float32x4x2_t data2;
-
-          data1 = vld2q_f32(r0);
-          data2 = vld2q_f32(r0 + 8);
-          input[0][0] = data1.val[0];
-          input[0][1] = data1.val[1];
-          input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r1);
-          data2 = vld2q_f32(r1 + 8);
-          input[1][0] = data1.val[0];
-          input[1][1] = data1.val[1];
-          input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r2);
-          data2 = vld2q_f32(r2 + 8);
-          input[2][0] = data1.val[0];
-          input[2][1] = data1.val[1];
-          input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          data1 = vld2q_f32(r3);
-          data2 = vld2q_f32(r3 + 8);
-          input[3][0] = data1.val[0];
-          input[3][1] = data1.val[1];
-          input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
-          input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
-
-          float32x4_t tmp1 = vdupq_n_f32(0.f);
-          float32x4_t tmp2 = vdupq_n_f32(0.f);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
-          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
-          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
-          tmp1 = vaddq_f32(tmp1, tmp2);
-
-          vst1q_f32(outputData, tmp1);
-          r0 += 8;
-          r1 += 8;
-          r2 += 8;
-          r3 += 8;
-          outputData += 4;
-        }
-
-        for (int r = 0; r < remain; r++) {
-          float32x4_t i0 = vld1q_f32(r0);
-          float32x4_t i1 = vld1q_f32(r1);
-          float32x4_t i2 = vld1q_f32(r2);
-          float32x4_t i3 = vld1q_f32(r3);
-          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
-          r0 += 2;
-          r1 += 2;
-          r2 += 2;
-          r3 += 2;
-          outputData++;
-        }
-      }
-    }
-  }
-};
-
-template <class T>
-struct Padding {
-  static void run(const T* input,
-                  T* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - inputHeight) / 2;
-    const int paddingWidth = (padInputWidth - inputWidth) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = T(0);
-        }
-
-        memcpy(inputPadding, input, inputWidth * sizeof(T));
-        inputPadding += inputWidth;
-        input += inputWidth;
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = T(0);
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-struct Padding<float> {
-  static void run(const float* input,
-                  float* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - inputHeight) / 2;
-    const int paddingWidth = (padInputWidth - inputWidth) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-
-        int step = inputWidth >> 2;
-        int remain = inputWidth & 3;
-        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(input);
-          vst1q_f32(inputPadding, s0);
-          input += 4;
-          inputPadding += 4;
-        }
-        for (int r = 0; r < remain; r++) {
-          *inputPadding++ = *input++;
-        }
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-// for stride is 2
-struct StridePadding {
-  static void run(const float* input,
-                  float* inputPadding,
-                  int channels,
-                  int inputHeight,
-                  int inputWidth,
-                  int padInputHeight,
-                  int padInputWidth) {
-    const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2;
-    const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2;
-    for (int c = 0; c < channels; c++) {
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-
-      for (int i = 0; i < inputHeight; i++) {
-        // padding head
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-
-        int step = inputWidth >> 2;
-        int remain = inputWidth & 3;
-        float32x4_t s1 = vdupq_n_f32(0.f);
-        for (int s = 0; s < step; s++) {
-          float32x4_t s0 = vld1q_f32(input);
-          float32x4x2_t v = {{s0, s1}};
-          vst2q_f32(inputPadding, v);
-          input += 4;
-          inputPadding += 8;
-        }
-        for (int r = 0; r < remain; r++) {
-          *inputPadding++ = *input++;
-          *inputPadding++ = float(0);
-        }
-        inputPadding--;
-
-        // padding tail
-        for (int j = 0; j < paddingWidth; j++) {
-          *inputPadding++ = float(0);
-        }
-        if (i != inputHeight - 1) {
-          memset(inputPadding, 0, padInputWidth * sizeof(float));
-          inputPadding += padInputWidth;
-        }
-      }
-
-      if (paddingHeight > 0) {
-        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
-        inputPadding += padInputWidth * paddingHeight;
-      }
-    }
-  }
-};
-
-#endif
-
-#endif
-
-}  // namespace neon
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
deleted file mode 100644
index feb77e1ff..000000000
--- a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(paddingH(), paddingW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input, input -> inputPadding
-    float* inputPadding = inputData;
-    int padInputHeight =
-        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
-    int padInputWidth =
-        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
-
-    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      if (strideH() == 1) {
-        neon::Padding<float>::run(inputData,
-                                  inputPadding,
-                                  batchSize * inputChannels,
-                                  inputHeight,
-                                  inputWidth,
-                                  padInputHeight,
-                                  padInputWidth);
-      } else if (strideH() == 2) {
-        neon::StridePadding::run(inputData,
-                                 inputPadding,
-                                 batchSize * inputChannels,
-                                 inputHeight,
-                                 inputWidth,
-                                 padInputHeight,
-                                 padInputWidth);
-      } else {
-        LOG(FATAL) << "Not supported";
-      }
-    }
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 4) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
-                    CPU,
-                    NeonDepthwiseConvTransposeFunction);
-
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/neon/neon_util.h b/paddle/legacy/function/neon/neon_util.h
deleted file mode 100644
index 95076b138..000000000
--- a/paddle/legacy/function/neon/neon_util.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#include <arm_neon.h>
-
-namespace paddle {
-
-namespace neon {
-
-inline float32x4_t vld1q_f32_aligned(const float* p) {
-  return vld1q_f32(
-      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
-}
-
-#ifndef __aarch64__
-inline float32_t vaddvq_f32(float32x4_t a) {
-  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
-  return vget_lane_f32(vpadd_f32(v, v), 0);
-}
-
-#define vmlaq_laneq_f32(a, b, v, lane) \
-  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
-#endif
-
-}  // namespace neon
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
deleted file mode 100644
index 81c832e77..000000000
--- a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "nnpack.h"
-#include "paddle/legacy/function/ConvOp.h"
-
-DEFINE_bool(nnpack_allocate_outside,
-            true,
-            "Allocate and free workspace memory outside the NNPACK interface.");
-DEFINE_int32(nnpack_num_threads,
-             0,
-             "The number of nnpack threads"
-             "default: 0; 0 to disable threadpool.");
-
-namespace paddle {
-
-nnp_convolution_algorithm get_nnp_convolution_algorithm(
-    const std::string& algorithm) {
-  if (algorithm == "auto") {
-    return nnp_convolution_algorithm_auto;
-  } else if (algorithm == "ft8x8") {
-    return nnp_convolution_algorithm_ft8x8;
-  } else if (algorithm == "ft16x16") {
-    return nnp_convolution_algorithm_ft16x16;
-  } else if (algorithm == "wt8x8") {
-    return nnp_convolution_algorithm_wt8x8;
-  } else if (algorithm == "implicit-gemm") {
-    return nnp_convolution_algorithm_implicit_gemm;
-  } else if (algorithm == "direct") {
-    return nnp_convolution_algorithm_direct;
-  } else {
-    return nnp_convolution_algorithm_auto;
-  }
-}
-
-template <DeviceType Device>
-class NNPACKConvFunction : public ConvFunctionBase {
- public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    transform_strategy_ = nnp_convolution_transform_strategy_compute;
-    nnp_status status = nnp_initialize();
-    CHECK_EQ(status, nnp_status_success);
-    workspaceBuffer_ = nullptr;
-    workspaceSize_ = 0;
-
-    create_nnpack_threadpool();
-  }
-
-  ~NNPACKConvFunction() {
-    if (workspaceBuffer_) {
-      free(workspaceBuffer_);
-    }
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
-    nnp_padding padding = {.top = (size_t)paddingH(),
-                           .right = (size_t)paddingW(),
-                           .bottom = (size_t)paddingH(),
-                           .left = (size_t)paddingW()};
-    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
-    nnp_size outputSubsampling = {.width = (size_t)strideW(),
-                                  .height = (size_t)strideH()};
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    void* bufferPtr = nullptr;
-    size_t* sizePtr = nullptr;
-    size_t needSize;
-    if (FLAGS_nnpack_allocate_outside) {
-      if (batchSize == 1) {
-        nnp_status status = nnp_convolution_inference(algorithm_,
-                                                      transform_strategy_,
-                                                      inputChannels,
-                                                      outputChannels,
-                                                      inputSize,
-                                                      padding,
-                                                      kernelSize,
-                                                      outputSubsampling,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      &needSize,
-                                                      nnp_activation_identity,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      } else {
-        // only supports stride = 1
-        CHECK_EQ(strideH(), 1);
-        CHECK_EQ(strideW(), 1);
-        nnp_status status = nnp_convolution_output(algorithm_,
-                                                   batchSize,
-                                                   inputChannels,
-                                                   outputChannels,
-                                                   inputSize,
-                                                   padding,
-                                                   kernelSize,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   &needSize,
-                                                   nnp_activation_identity,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-
-      VLOG(3) << "workspace size is " << needSize;
-      if (needSize > workspaceSize_) {
-        workspaceSize_ = needSize;
-        if (workspaceBuffer_) {
-          free(workspaceBuffer_);
-        } else {
-          posix_memalign(&workspaceBuffer_, 64, needSize);
-        }
-      }
-
-      if (needSize) {
-        bufferPtr = workspaceBuffer_;
-        sizePtr = &needSize;
-      }
-    }
-
-    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
-    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    if (batchSize == 1) {
-      for (size_t g = 0; g < groups_; g++) {
-        nnp_status status =
-            nnp_convolution_inference(algorithm_,
-                                      transform_strategy_,
-                                      inputChannels / groups_,
-                                      outputChannels / groups_,
-                                      inputSize,
-                                      padding,
-                                      kernelSize,
-                                      outputSubsampling,
-                                      inputData + inputOffset * g,
-                                      filterData + filterOffset * g,
-                                      nullptr, /* bias */
-                                      outputData + outputOffset * g,
-                                      bufferPtr,
-                                      sizePtr,
-                                      nnp_activation_identity,
-                                      nullptr,
-                                      threadpool_, /* threadpool */
-                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-    } else {
-      // only supports stride = 1
-      CHECK_EQ(strideH(), 1);
-      CHECK_EQ(strideW(), 1);
-
-      // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1.
-      CHECK_EQ(groups_, static_cast<size_t>(1));
-      nnp_status status = nnp_convolution_output(algorithm_,
-                                                 batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
-                                                 inputSize,
-                                                 padding,
-                                                 kernelSize,
-                                                 inputData,
-                                                 filterData,
-                                                 nullptr, /* bias */
-                                                 outputData,
-                                                 bufferPtr,
-                                                 sizePtr,
-                                                 nnp_activation_identity,
-                                                 nullptr,
-                                                 threadpool_, /* threadpool */
-                                                 nullptr);
-      CHECK_EQ(status, nnp_status_success);
-    }
-  }
-
-  static void create_nnpack_threadpool() {
-    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
-  }
-
- private:
-  nnp_convolution_algorithm algorithm_;
-  nnp_convolution_transform_strategy transform_strategy_;
-  void* workspaceBuffer_;
-  size_t workspaceSize_;
-  static pthreadpool_t threadpool_;
-};
-
-template <DeviceType Device>
-pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
-
-REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
deleted file mode 100644
index a2db83f5a..000000000
--- a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/function/ConvOpTest.h"
-
-namespace paddle {
-
-TEST(NNPACK, Forward) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-TEST(NNPACK, Depthwise) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/CMakeLists.txt b/paddle/legacy/gserver/CMakeLists.txt
deleted file mode 100644
index 6dc877dd9..000000000
--- a/paddle/legacy/gserver/CMakeLists.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-# Gserver package contains:
-#   * Layers
-#   * Activations
-#   * DataProviders
-#   * Evaluators
-#   * GradientMachines(NeuralNetwork)
-file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
-file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
-set(GSERVER_SOURCES
-    layers/LstmCompute.cu
-    layers/GruCompute.cu
-    ${GSERVER_SOURCES})
-
-macro(filter_test VAR_NAME)
-    set(tmp)
-    foreach(p IN LISTS ${VAR_NAME})
-        if(NOT ${p} MATCHES ".*tests/.*")
-             set(tmp ${p} ${tmp})
-        endif()
-    endforeach()
-    set(${VAR_NAME} ${tmp})
-endmacro()
-
-filter_test(GSERVER_HEADER)
-filter_test(GSERVER_SOURCES)
-
-if(NOT WITH_MKLDNN)
-    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
-    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
-    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
-else()
-    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
-endif()
-
-if(NOT WITH_MKLML)
-    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
-    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
-    message(STATUS "Skip compiling with MKLPackedLayers")
-else()
-    message(STATUS "Compile with MKLPackedLayers")
-endif()
-
-if(NOT WITH_GPU)
-    list(REMOVE_ITEM GSERVER_HEADER
-        layers/CudnnConvBaseLayer.h
-        layers/CudnnConvLayer.h
-        layers/CudnnConvTransLayer.h
-        layers/CudnnPoolLayer.h
-        layers/CudnnBatchNormLayer.h)
-
-    list(REMOVE_ITEM GSERVER_SOURCES
-        layers/CudnnConvBaseLayer.cpp
-        layers/CudnnConvLayer.cpp
-        layers/CudnnConvTransLayer.cpp
-        layers/CudnnPoolLayer.cpp
-        layers/CudnnBatchNormLayer.cpp)
-    compile_cu_as_cpp(layers/LstmCompute.cu)
-    compile_cu_as_cpp(layers/GruCompute.cu)
-endif()
-
-if(NOT WITH_PYTHON)
-    list(REMOVE_ITEM GSERVER_SOURCES
-            dataproviders/PyDataProvider.cpp)
-    
-    list(REMOVE_ITEM GSERVER_HEADER
-            dataproviders/PyDataProvider.h)
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove evaluators
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/ValidationLayer.cpp
-         evaluators/Evaluator.cpp
-         evaluators/DetectionMAPEvaluator.cpp
-         evaluators/CTCErrorEvaluator.cpp
-         evaluators/ChunkEvaluator.cpp)
-
-    # Remove dataproviders
-    list(REMOVE_ITEM GSERVER_SOURCES
-         dataproviders/DataProvider.cpp
-         dataproviders/MultiDataProvider.cpp
-         dataproviders/PyDataProvider2.cpp
-         dataproviders/PyDataProvider.cpp)
-
-    # Remove useless gradientmachines
-    list(REMOVE_ITEM GSERVER_SOURCES
-         gradientmachines/MultiNetwork.cpp
-         gradientmachines/RecurrentGradientMachine.cpp
-         gradientmachines/ParallelNeuralNetwork.cpp
-         gradientmachines/GradientMachineMode.cpp
-         gradientmachines/MultiGradientMachine.cpp)
-
-    # Remove layers that used in training
-    list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp
-         layers/CostLayer.cpp
-         layers/MultiBoxLossLayer.cpp
-         layers/WarpCTCLayer.cpp
-         layers/CTCLayer.cpp
-         layers/LinearChainCTC.cpp
-         layers/PrintLayer.cpp)
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/OuterProdLayer.cpp
-         layers/SumToOneNormLayer.cpp
-         layers/ConvShiftLayer.cpp
-         layers/InterpolationLayer.cpp
-         layers/AgentLayer.cpp
-         layers/DotMulOperator.cpp
-         layers/GruStepLayer.cpp
-         layers/LstmStepLayer.cpp
-         layers/ConvexCombinationLayer.cpp
-         layers/Conv3DLayer.cpp
-         layers/DeConv3DLayer.cpp
-         layers/CropLayer.cpp
-         layers/CrossEntropyOverBeam.cpp
-         layers/DataNormLayer.cpp
-         layers/FeatureMapExpandLayer.cpp
-         layers/HierarchicalSigmoidLayer.cpp
-         layers/MultinomialSampler.cpp
-         layers/NCELayer.cpp
-         layers/KmaxSeqScoreLayer.cpp
-         layers/MDLstmLayer.cpp
-         layers/MultiplexLayer.cpp
-         layers/PadLayer.cpp
-         layers/Pool3DLayer.cpp
-         layers/ResizeLayer.cpp
-         layers/RotateLayer.cpp
-         layers/RowConvLayer.cpp
-         layers/RowL2NormLayer.cpp
-         layers/SamplingIdLayer.cpp
-         layers/ScaleShiftLayer.cpp
-         layers/SelectiveFullyConnectedLayer.cpp
-         layers/SpatialPyramidPoolLayer.cpp
-         layers/BilinearInterpLayer.cpp
-         layers/ClipLayer.cpp)
-endif()
-
-if(WITH_GPU)
-    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
-else()
-    add_library(paddle_gserver STATIC
-        ${GSERVER_SOURCES})
-endif()
-
-add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.cpp b/paddle/legacy/gserver/activations/ActivationFunction.cpp
deleted file mode 100644
index ae07c7e6d..000000000
--- a/paddle/legacy/gserver/activations/ActivationFunction.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ActivationFunction.h"
-
-#include <algorithm>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <thread>
-#include <type_traits>
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "MKLDNNActivation.h"
-#endif
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gActivationRegistrar;
-/**
- * @def ACTIVATION_CLASS_NAME
- * @brief Macro for getting derived activation class name
- * @note ACTIVATION_CLASS_NAME(softmax) softmax_;
- * means softmaxActivation softmax_;
- */
-#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation
-/**
- * @def BEGIN_DEFINE_ACTIVATION
- * @brief Macro for defining a devried activation class
- */
-#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
-  class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
-   private:                                                                  \
-    static const std::string name;                                           \
-                                                                             \
-   public:                                                                   \
-    const std::string& getName() const { return name; }
-/**
- * @def END_DEFINE_ACTIVATION
- * @brief Macro for registering a derived activation class
- */
-#define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
-  }                                                                \
-  ;                                                                \
-  const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
-      #ACTIVATION_NAME;                                            \
-  static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
-    gActivationRegistrar                                           \
-        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(    \
-            #ACTIVATION_NAME);                                     \
-  });
-
-/**
- * @brief The IdentityActivation class
- *
- * Do nothing when forward/backward.
- */
-class IdentityActivation : public ActivationFunction {
- public:
-  static const std::string name;
-  Error __must_check forward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  Error __must_check backward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  const std::string& getName() const { return name; }
-};
-const std::string IdentityActivation::name = "";
-static InitFunction __reg_activation__identity([] {
-  gActivationRegistrar.registerClass<IdentityActivation>("");
-  gActivationRegistrar.registerClass<IdentityActivation>("linear");
-});
-
-/**
- * @brief Sigmoid Activation
- * \f[
- * f(z) = \frac{1}{1+exp(-z)}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sigmoid)
-Error __must_check forward(Argument& act) {
-  act.value->sigmoid(*act.value);
-  return Error();
-}
-Error __must_check backward(Argument& act) {
-  act.grad->sigmoidDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sigmoid)
-
-/**
- * @brief Softmax Activation
- * \f[
- * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softmax)
-private:
-MatrixPtr sftMaxSum_;
-MatrixPtr sftMaxDot_;
-
-public:
-Error __must_check forward(Argument& act) {
-  act.value->softmax(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-
-  if (outputG->useGpu()) {
-    outputG->softmaxBackward(*outputV);
-  } else {
-    SetDevice device(act.deviceId);
-    Matrix::resizeOrCreate(sftMaxDot_,
-                           outputG->getHeight(),
-                           outputG->getWidth(),
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-    Matrix::resizeOrCreate(sftMaxSum_,
-                           outputG->getHeight(),
-                           1,
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-
-    sftMaxDot_->dotMul(*outputG, *outputV);
-    sftMaxSum_->colMerge(*sftMaxDot_);
-
-    act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(softmax)
-
-/**
- * @brief Sequence_softmax Activation
- * @note Softmax on all frames of one sequence.
- * Width of frame must be one.
- */
-BEGIN_DEFINE_ACTIVATION(sequence_softmax)
-private:
-ACTIVATION_CLASS_NAME(softmax) softmax_;
-Argument argument_;
-
-public:
-Error __must_check forward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  if (!argument_.value) {
-    argument_.value = Matrix::create(nullptr,
-                                     /* height= */ 1,
-                                     1,
-                                     /* trans= */ false,
-                                     useGpu(act.deviceId));
-    argument_.grad = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    useGpu(act.deviceId));
-  }
-
-  auto starts =
-      act.hasSubseq()
-          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
-          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
-  act.value->sequenceSoftmax(*act.value, *starts);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  size_t numSequences =
-      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
-  const int* starts = act.getCpuStartPositions();
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    // TODO(Dangqingqing) optimization for GPU
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    argument_.value->setData(act.value->getData() + offset, 1UL, size);
-    argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
-
-    Error err = softmax_.backward(argument_);
-    if (!err.isOK()) return err;
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(sequence_softmax)
-
-/*
- * @brief SoftSign Activation.
- * \f[
- * f(z) = \frac{z}{1 + |z|}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softsign)
-private:
-MatrixPtr denominator_;
-
-Error __must_check forward(Argument& act) {
-  size_t height = act.value->getHeight();
-  size_t width = act.value->getWidth();
-  Matrix::resizeOrCreate(
-      denominator_, height, width, false, useGpu(act.deviceId));
-  denominator_->assign(*act.value);
-  denominator_->abs2();
-  denominator_->add(1.);
-
-  act.value->dotDiv(*act.value, *denominator_);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  denominator_->square2();
-  denominator_->scalarDiv(*denominator_, 1.);
-  act.grad->dotMul(*act.grad, *denominator_);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softsign)
-
-/**
- * @brief Relu Activation.
- * forward. y = max(0, z)
- *
- * derivative of relu is:
- *
- *    1 if z > 0
- *
- *    0 otherwise.
- */
-BEGIN_DEFINE_ACTIVATION(relu)
-Error __must_check forward(Argument& act) {
-  act.value->relu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->reluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(relu)
-
-/**
- * @brief BRelu Activation.
- *
- * forward. y = min(24, max(0, z))
- *
- * derivative of brelu is:
- *
- *    1 if 0 < z < 24
- *
- *    0 otherwise.
- *
- * TODO(yuyang18): Remove magic number 24 or make it configuable.
- */
-BEGIN_DEFINE_ACTIVATION(brelu)
-Error __must_check forward(Argument& act) {
-  act.value->brelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->breluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(brelu)
-
-/**
- * @brief Tanh Activation.
- * \f[
- * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(tanh)
-Error __must_check forward(Argument& act) {
-  act.value->tanh(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->tanhDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(tanh)
-
-/**
- * @brief Scaled Tanh Activation
- * \f[
- * f(z) = 1.7159 * tanh(2/3*z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(stanh)
-private:
-real a, b;
-
-public:
-ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-Error __must_check forward(Argument& act) {
-  act.value->scaledTanh(*act.value, a, b);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->scaledTanhDerivative(*act.value, a, b);
-  return Error();
-}
-END_DEFINE_ACTIVATION(stanh)
-
-/**
- * @brief Soft Relu Activation.
- * \f[
- * f(z) = ln(1+e^z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softrelu)
-Error __must_check forward(Argument& act) {
-  act.value->softrelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->softreluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softrelu)
-
-/**
- * @brief Abs Activation.
- * Forward: f(z) = abs(z)
- *
- * Derivative:
- *
- *     1   if z>0
- *
- *    -1   if z<0
- *
- *     0   if z=0
- */
-BEGIN_DEFINE_ACTIVATION(abs)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->abs2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->absDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(abs)
-
-/**
- * @brief Square Activation.
- * \f[
- * f(z) = z^2.
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(square)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->square2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->squareDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(square)
-
-/**
- * @brief Exponential Activation.
- * \f[
- * f(z) = e^z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(exponential)
-Error __must_check forward(Argument& act) {
-  act.value->exp2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->expDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(exponential)
-
-/**
- * @brief Reciprocal Activation.
- * \f[
- * f(z) = 1/z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(reciprocal)
-Error __must_check forward(Argument& act) {
-  act.value->reciprocal2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotMulSquare(*act.value);
-  act.grad->neg();
-  return Error();
-}
-END_DEFINE_ACTIVATION(reciprocal)
-
-/**
- * @brief Square Root Activation.
- * \f[
- * f(z) = sqrt(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sqrt)
-Error __must_check forward(Argument& act) {
-  act.value->sqrt2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.value);
-  act.grad->mulScalar(0.5);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sqrt)
-
-/**
- * @brief Logarithm Activation.
- * \f[
- * f(z) = log(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(log)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->log2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(log)
-
-ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_WITH_MKLDNN
-  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
-    return MKLDNNActivation::create(type);
-  }
-#endif
-
-  return gActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.h b/paddle/legacy/gserver/activations/ActivationFunction.h
deleted file mode 100644
index 8bc5b0f52..000000000
--- a/paddle/legacy/gserver/activations/ActivationFunction.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-struct Argument;
-/**
- * @brief Activation function is a function that transforms a set of input
- * signals into an output signals. The purpose of the activation function
- * is to introduce non-liearilty into the network.
- *
- * @note Common activation function are provieded, including linear,
- * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh,
- * softrelu, abs, square, exponential.
- *
- */
-class ActivationFunction {
- public:
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-
-  ActivationFunction() {}
-
-  virtual ~ActivationFunction() {}
-
-  /**
-   * @brief Foward propagation
-   *
-   * act.value <- f(act.value),
-   * where f is the activation function.
-   * Suppose that before calling forward(), act.value is x and
-   * after forward() is called, act.value is y, then y = f(x).
-   *
-   * Usually, act is Layer::output_
-   */
-  virtual Error __must_check forward(Argument& act) = 0;
-
-  /**
-   * @brief Backward propagaion
-   *
-   * x and y are defined in the above comment for forward().
-   * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
-   * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
-   */
-  virtual Error __must_check backward(Argument& act) = 0;
-
-  virtual const std::string& getName() const = 0;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
deleted file mode 100644
index 2eed7af70..000000000
--- a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNActivation.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
-/**
- * @def MKLDNN_ACTIVATION_CLASS_NAME
- * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
- * means mkldnn_reluActivation relu_;
- */
-#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
-
-/**
- * @def BEGIN_MKLDNN_ACTIVATION
- */
-#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
-/**
- * @def END_MKLDNN_ACTIVATION
- */
-#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
- private:                                                          \
-  static const std::string name;                                   \
-                                                                   \
- public:                                                           \
-  const std::string& getName() const { return name; }              \
-  }                                                                \
-  ;                                                                \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
-      "mkldnn_" #ACT_TYPE;                                         \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
-    gMKLDNNActivationRegistrar                                     \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
-            "mkldnn_" #ACT_TYPE);                                  \
-  });
-
-/**
- * @def DEFINE_MKLDNN_ACTIVATION
- */
-#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)
-
-/**
- * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
- */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
-    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
- private:                                                            \
-  static const float alpha;                                          \
-  static const float bwdAlpha;                                       \
-                                                                     \
- public:                                                             \
-  float getAlpha() const { return alpha; }                           \
-  float getBwdAlpha() const { return bwdAlpha; }                     \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
-
-/**
- * @brief MKLDNN Relu Activation.
- * Actually mkldnn_relu is Leaky Relu.
- *  f(x) = x                   (x >= 0)
- *  f(x) = negative_slope * x  (x <  0)
- * @note the negative_slope should be -0.f in forward
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
-
-/**
- * @brief MKLDNN Tanh Activation.
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-/**
- * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
- *  f(x) = x                              (x >= 0)
- *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
-  const std::map<std::string, mkldnn::algorithm> algoMap = {
-      {"relu", algorithm::eltwise_relu},
-      {"tanh", algorithm::eltwise_tanh},
-      {"elu", algorithm::eltwise_elu}};
-  type.erase(0, 7);  // remove mkldnn_
-  algorithm algo = (algorithm)0;
-  mapGet(type, algoMap, &algo);
-  return algo;
-}
-
-void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  // note: alpha represents the NegativeSlope when used in relu.
-  float alpha = getAlpha();
-  float beta = getBeta();
-  algorithm algo = getAlgo(this->getName());
-  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                   algo,
-                                   val_->getMemoryDesc(),
-                                   alpha,
-                                   beta);
-  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
-  // use inplace for forward but save input value before submit
-  inVal_ = val_;
-  copyInVal_ = nullptr;
-  if (act.grad && algo == algorithm::eltwise_tanh) {
-    // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
-    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-    CHECK(copyInVal_) << "should not be emptry";
-    pipelineFwd_.push_back(*copyInVal_);
-  }
-  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-  needResetBwd_ = true;
-}
-
-void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
-  if (!needResetBwd_) {
-    return;
-  }
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-  needResetBwd_ = false;
-  algorithm algo = getAlgo(this->getName());
-  float alpha = getBwdAlpha();
-  float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
-  auto eng = CPUEngine::Instance().getEngine();
-  auto bwdDesc = eltwise_bwd::desc(
-      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-  CHECK(inVal_);
-  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-  pipelineBwd_.clear();
-  pipelineBwd_.push_back(*bwd_);
-}
-
-/**
- * @brief MKLDNN Softmax Activation
- */
-DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
-
-void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  int axis = 1;
-  auto fwdDesc = softmax_fwd::desc(
-      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
-  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
-  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-}
-
-Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  real* v = act.value->getData();
-  real threshold = exp(-64);
-#pragma omp parallel for
-  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
-    v[i] = v[i] < threshold ? threshold : v[i];
-  }
-  return Error();
-}
-
-Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-  Matrix::resizeOrCreate(sftMaxDot_,
-                         outputG->getHeight(),
-                         outputG->getWidth(),
-                         /* trans */ false,
-                         /* useGpu */ false);
-  Matrix::resizeOrCreate(sftMaxSum_,
-                         outputG->getHeight(),
-                         1,
-                         /* trans */ false,
-                         /* useGpu */ false);
-  sftMaxDot_->dotMul(*outputG, *outputV);
-  sftMaxSum_->colMerge(*sftMaxDot_);
-  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  return Error();
-}
-
-ActivationFunction* MKLDNNActivation::create(const std::string& type) {
-  return gMKLDNNActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gMKLDNNActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-void MKLDNNActivation::resetFwd(Argument& act) {
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-  cnt_ = act.value->getElementCnt();
-  pipelineFwd_.clear();
-  stream_.reset(new MKLDNNStream());
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
-  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-  if (val_ == nullptr) {
-    int bs = act.getBatchSize();
-    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-    int ic = cnt_ / bs / ih / iw;
-    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-    val_ = MKLDNNMatrix::create(
-        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
-    CHECK(val_);
-    val_->downSpatial();
-  }
-}
-
-Error __must_check MKLDNNActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  return Error();
-}
-Error __must_check MKLDNNActivation::backward(Argument& act) {
-  resetBwd(act);
-  stream_->submit(pipelineBwd_);
-  return Error();
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.h b/paddle/legacy/gserver/activations/MKLDNNActivation.h
deleted file mode 100644
index 59c447ad0..000000000
--- a/paddle/legacy/gserver/activations/MKLDNNActivation.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ActivationFunction.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/math/MKLDNNMatrix.h"
-#include "paddle/legacy/parameter/Argument.h"
-
-namespace paddle {
-
-/**
- * @brief Base class of MKLDNN Activation.
- * Common activation function are provieded,
- * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
- */
-class MKLDNNActivation : public ActivationFunction {
- protected:
-  // input value element count
-  size_t cnt_;
-  // should not merge the resetBwd into resetFwd,
-  // because the grad data would be changing before backward.
-  bool needResetBwd_;
-  // mkldnn matrix, primitive, stream and pipeline
-  MKLDNNMatrixPtr val_;
-  MKLDNNMatrixPtr grad_;
-  std::shared_ptr<mkldnn::engine> engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwd_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
- public:
-  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
-  ~MKLDNNActivation() {}
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-  virtual const std::string& getName() const = 0;
-  /**
-   * reset the forward primitives
-   */
-  virtual void resetFwd(Argument& act);
-  /**
-   * reset the backward primitives,
-   * can not merge this functions into resetFwd as the grad data
-   * would be changing before backward.
-   */
-  virtual void resetBwd(Argument& act) {}
-  virtual Error __must_check forward(Argument& act);
-  virtual Error __must_check backward(Argument& act);
-};
-
-/**
- * @brief Base class of MKLDNN Eltwise Activation,
- * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
- */
-class MKLDNNEltwiseActivation : public MKLDNNActivation {
-  typedef mkldnn::eltwise_forward eltwise_fwd;
-  typedef mkldnn::eltwise_backward eltwise_bwd;
-  typedef mkldnn::algorithm algorithm;
-
- protected:
-  // save the forward primitive desc, which can be used backward
-  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
-  // eltwise_bwd need src input value
-  MKLDNNMatrixPtr inVal_;
-  // use for copy data
-  std::shared_ptr<mkldnn::reorder> copyInVal_;
-
- public:
-  MKLDNNEltwiseActivation() {}
-  ~MKLDNNEltwiseActivation() {}
-  virtual const std::string& getName() const = 0;
-
-  // in common, the alpha of forward and backward should be equal.
-  // but for relu, to avoid negative value, they should be opposite
-  virtual float getAlpha() const = 0;
-  virtual float getBwdAlpha() const = 0;
-  virtual float getBeta() const { return 0.f; }
-  virtual algorithm getAlgo(std::string type) const;
-  void resetFwd(Argument& act) override;
-  void resetBwd(Argument& act) override;
-};
-
-/**
- * @brief Base class of MKLDNN softmax Activation,
- * only have mkldnn forward, use cpu implement for backward.
- */
-class MKLDNNSoftmaxActivation : public MKLDNNActivation {
-  typedef mkldnn::softmax_forward softmax_fwd;
-
- private:
-  // for backward
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sftMaxDot_;
-
- public:
-  MKLDNNSoftmaxActivation() {}
-  ~MKLDNNSoftmaxActivation() {}
-  virtual const std::string& getName() const = 0;
-  void resetFwd(Argument& act) override;
-  Error __must_check forward(Argument& act) override;
-  Error __must_check backward(Argument& act) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.cpp b/paddle/legacy/gserver/dataproviders/DataProvider.cpp
deleted file mode 100644
index b67af8a32..000000000
--- a/paddle/legacy/gserver/dataproviders/DataProvider.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataProvider.h"
-
-#include <unistd.h>
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void BufferBatch::swap(BufferBatch* bufBatch) {
-  DataBatch* batchData = bufBatch->getDataBatch();
-  hl_event_t hlEvent = bufBatch->getCuEvent();
-  hl_stream_t hlStream = bufBatch->getCuStream();
-  bufBatch->setDataBatch(batchData_);
-  bufBatch->setCuStream(hlStream_);
-  bufBatch->setCuEvent(hlEvent_);
-
-  batchData_ = batchData;
-  hlEvent_ = hlEvent;
-  hlStream_ = hlStream;
-}
-
-void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
-  if (batchData_ == NULL) {
-    batchData_ = new DataBatch();
-  }
-  std::vector<Argument>& destData = batchData_->getStreams();
-  int numStreams = srcBatch->getNumStreams();
-  destData.resize(numStreams);
-  batchData_->setSize(srcBatch->getSize());
-  if (useGpu) {
-    createCuEvent();
-  }
-
-  for (int i = 0; i < numStreams; i++) {
-    destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_);
-  }
-  if (useGpu) {
-    hl_stream_record_event(hlStream_, hlEvent_);
-  }
-}
-
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
-                           bool useGpu,
-                           int64_t batchSize) {
-  batchSize_ = batchSize;
-  dataPool_ = dataPool;
-  useGpu_ = useGpu;
-  dataQueue_ = new BufferBatchQueue();
-  bufferQueue_ = new BufferBatchQueue();
-
-  // insert a empty buffer
-  bufferQueue_->enqueue(new BufferBatch());
-  stopping_ = false;
-  pending_ = true;
-}
-
-DoubleBuffer::~DoubleBuffer() {
-  finishAsyncLoad();
-  while (dataQueue_->size()) {
-    BufferBatch* dataBtch = dataQueue_->dequeue();
-    delete dataBtch;
-    dataBtch = NULL;
-  }
-  while (bufferQueue_->size()) {
-    BufferBatch* bufBtch = bufferQueue_->dequeue();
-    delete bufBtch;
-    bufBtch = NULL;
-  }
-  delete dataQueue_;
-  dataQueue_ = NULL;
-  delete bufferQueue_;
-  bufferQueue_ = NULL;
-}
-
-void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
-  // get data
-  BufferBatch* batch = dataQueue_->dequeue();
-  batch->syncEvent();  // when use GPU, need synchronized with the cuEvent
-  *dataBatch = *(batch->getDataBatch());
-
-  // push anothor buffer
-  if (*usingBatch_ == nullptr) {
-    *usingBatch_ = std::make_shared<BufferBatch>();
-  }
-
-  // Mark the using-batch
-  batch->swap((*usingBatch_).get());
-  bufferQueue_->enqueue(batch);
-
-  if (0 == dataBatch->getSize()) {
-    setPending(true);
-  }
-}
-
-void DoubleBuffer::insertOneBatch(DataBatch* batch) {
-  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
-    if (stopping_) return;
-  }
-  BufferBatch* bufBatch = bufferQueue_->dequeue();
-  // clone and copy the data from an Threadlocal Variable
-  bufBatch->clone(batch, useGpu_);
-  dataQueue_->enqueue(bufBatch);
-}
-
-void DoubleBuffer::asyncLoadBatch() {
-  int64_t actualSize = 0;
-  if (useGpu_) {
-    hl_set_device(FLAGS_gpu_id);
-  }
-  setPending(false);
-
-  while (true) {
-    taskReadySem_.wait();
-    if (stopping_) break;
-
-    while (batchSize_ == 0 && !stopping_) {
-      usleep(5);
-    }
-    if (stopping_) break;
-
-    do {
-      DataBatch newBatch;
-      {
-        REGISTER_TIMER("getNextBatchInternal");
-        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
-      }
-      insertOneBatch(&newBatch);
-    } while (actualSize > 0 && !stopping_);
-  }
-}
-
-void DoubleBuffer::startAsyncLoad() {
-  if (asyncLoader_ == nullptr) {
-    asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); }));
-  }
-  taskReadySem_.post();
-}
-
-ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-    DataProvider::registrar_;
-
-DataProvider* DataProvider::create(const DataConfig& config,
-                                   const ModelConfig& modelConfig,
-                                   bool useGpu) {
-  return registrar_.createByType(config.type(), config, modelConfig, useGpu);
-}
-
-REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
-REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-
-int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
-  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
-                                    : getNextBatchInternal(size, batch);
-
-  if (!batchSize) return 0;
-
-  if (!config_.constant_slots_size()) return batchSize;
-
-  auto& constantSlots = *constantSlots_;
-  constantSlots.resize(config_.constant_slots_size());
-
-  for (int i = 0; i < config_.constant_slots_size(); ++i) {
-    MemoryHandlePtr handle =
-        constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
-    Matrix::resizeOrCreate(constantSlots[i],
-                           batchSize,
-                           1,         // = width
-                           false,     // = trans
-                           useGpu_);  // = useGpu
-    if (handle != constantSlots[i]->getMemoryHandle()) {
-      // memory buf was reallocated. We need to initialize the value
-      constantSlots[i]->assign(config_.constant_slots(i));
-    }
-    batch->appendData(constantSlots[i],
-                      batch->getStream(0).sequenceStartPositions);
-  }
-
-  return batchSize;
-}
-
-int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) {
-  CHECK(doubleBuffer_ != nullptr);
-
-  if (doubleBuffer_->getBatchSize() != size) {
-    doubleBuffer_->setBatchSize(size);
-  }
-
-  doubleBuffer_->removeOneBatch(batch);
-  return batch->getSize();
-}
-
-void DataProvider::initAsyncLoader() {
-  if (doubleBuffer_ == nullptr) {
-    doubleBuffer_.reset(new DoubleBuffer(this, useGpu_));
-  }
-  useGpu_ = false;  // Avoid D2D copy, it will delay the computing performance
-}
-
-SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
-                                               bool useGpu,
-                                               bool withInfo)
-    : DataProvider(config, useGpu) {
-  /* initialize the size of a sample, and the buffer */
-  sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
-  bufferCapacity_ = config_.buffer_capacity();
-  withInfo_ = withInfo;
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-
-  /* malloc buffer in cpu */
-  hInputDataBuf_ = std::make_shared<CpuMatrix>(bufferCapacity_, sampleDim_);
-  hInputLabelBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-  hInputInfoBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-}
-
-void SimpleDataProviderBase::shuffle() {
-  int i, t;
-  int len = sampleNumInBuf_;
-  std::vector<real> temp(sampleDim_);
-  real* data = hInputDataBuf_->getData();
-  int* label = hInputLabelBuf_->getData();
-  int* info = hInputInfoBuf_->getData();
-  int sampleSz = sizeof(real) * sampleDim_;
-  for (i = 0; i < len; i++) {
-    int randNum = rand();  // NOLINT TODO(yuyang18): Use rand_r instead?
-    t = randNum % (len - i) + i;
-    // swap
-    if (i != t) {
-      // swap data
-      memcpy(&temp[0], &data[i * sampleDim_], sampleSz);
-      memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz);
-      memcpy(&data[t * sampleDim_], &temp[0], sampleSz);
-      std::swap(label[i], label[t]);
-      if (withInfo_) {
-        std::swap(info[i], info[t]);
-      }
-    }
-  }
-}
-
-int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size,
-                                                     DataBatch* batch) {
-  CHECK(batch != NULL);
-  batch->clear();
-
-  int64_t startIndex;
-  int64_t cpySize;
-
-  std::lock_guard<RWLock> guard(lock_);
-  if (sampleNumInBuf_ - nextItemIndex_ < size) {
-    int64_t n = fillBuffer();
-    VLOG(1) << "fillBuffer return " << n << " samples.\n";
-  }
-
-  startIndex = nextItemIndex_;
-  cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_);
-  nextItemIndex_ += cpySize;
-
-  if (cpySize > 0) {
-    real* data = hInputDataBuf_->getData() + startIndex * sampleDim_;
-    int* label = hInputLabelBuf_->getData() + startIndex;
-    int* info = hInputInfoBuf_->getData() + startIndex;
-
-    MatrixPtr& dataBatch = *dataBatch_;     // get the thread local object
-    IVectorPtr& labelBatch = *labelBatch_;  // get the thread local object
-    IVectorPtr& infoBatch = *infoBatch_;    // get the thread local object
-    if (!dataBatch) {
-      dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_);
-      labelBatch = IVector::create(cpySize, useGpu_);
-      if (withInfo_) {
-        infoBatch = IVector::create(cpySize, 0);
-      }
-    } else {
-      dataBatch->resize(cpySize, sampleDim_);
-      labelBatch->resize(cpySize);
-      if (withInfo_) {
-        infoBatch->resize(cpySize);
-      }
-    }
-    dataBatch->copyFrom(data, cpySize * sampleDim_);
-    labelBatch->copyFrom(label, cpySize);
-    batch->appendData(dataBatch);
-    batch->appendLabel(labelBatch);
-    if (withInfo_) {
-      infoBatch->copyFrom(info, cpySize);
-      batch->appendLabel(infoBatch);
-    }
-  }
-
-  batch->setSize(cpySize);
-  return cpySize;
-}
-
-void SimpleDataProviderBase::reset() {
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-  DataProvider::reset();
-}
-
-int64_t SimpleDataProviderBase::getSize() {
-  LOG(FATAL) << "Currently, not implemented";
-  return 0;
-}
-
-int64_t SimpleDataProviderBase::fillBuffer() {
-  int64_t n = sampleNumInBuf_ - nextItemIndex_;
-
-  /* flash the remaining data to the beginning of the buffer */
-  if (n > 0) {
-    hInputDataBuf_->copyFrom(
-        hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_,
-        n * sampleDim_);
-    hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n);
-    if (withInfo_) {
-      hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n);
-    }
-  }
-
-  sampleNumInBuf_ =
-      n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
-                        hInputLabelBuf_->getData() + n,
-                        hInputInfoBuf_->getData() + n,
-                        bufferCapacity_ - n);
-
-  /* for stachastic gradient training */
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  nextItemIndex_ = 0;
-
-  return sampleNumInBuf_;
-}
-
-SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
-    : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false),
-      currentSampleIndex_(0) {
-  loadData(config_.files());
-}
-
-SimpleDataProvider::~SimpleDataProvider() {}
-
-int64_t SimpleDataProvider::fillBufferImp(real* data,
-                                          int* label,
-                                          int* info,
-                                          int64_t size) {
-  (void)info;
-  int64_t n = std::min<int64_t>(labels_.size() - currentSampleIndex_, size);
-  memcpy(data,
-         &data_[currentSampleIndex_ * sampleDim_],
-         n * sampleDim_ * sizeof(real));
-  memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
-  currentSampleIndex_ += n;
-
-  return n;
-}
-
-void SimpleDataProvider::reset() {
-  currentSampleIndex_ = 0;
-  SimpleDataProviderBase::reset();
-}
-
-void SimpleDataProvider::loadData(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  std::string line;
-  while (is) {
-    if (!getline(is, line)) break;
-    LOG(INFO) << "load data file " << line;
-    loadDataFile(line);
-  }
-  LOG(INFO) << "read done, num of instance=" << labels_.size()
-            << " data size=" << data_.size();
-}
-
-void SimpleDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  std::string line;
-  std::vector<std::string> pieces;
-  while (is) {
-    if (!getline(is, line)) break;
-    str::split(line, ' ', &pieces);
-    CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size())
-        << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName
-        << " " << sampleDim_ << " from config";
-    labels_.push_back(atoi(pieces[0].c_str()));
-    for (int i = 0; i < sampleDim_; ++i) {
-      data_.push_back(atof(pieces[i + 1].c_str()));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.h b/paddle/legacy/gserver/dataproviders/DataProvider.h
deleted file mode 100644
index c2e1c5fdd..000000000
--- a/paddle/legacy/gserver/dataproviders/DataProvider.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <mutex>
-#include <vector>
-
-#include "DataConfig.pb.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-/**
- * @def REGISTER_DATA_PROVIDER
- * @brief Macro for registering a data provider. The class type should contain
- *        a consturctor with parameter (DataConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                    \
-    DataProvider::registrar_.registerClass(                              \
-        #__type_name,                                                    \
-        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
-          DataProvider* dp = new __class_name(conf, useGpu);             \
-          return dp;                                                     \
-        });                                                              \
-  })
-
-/**
- * @def REGISTER_DATA_PROVIDER_EX
- * @brief Macro for registering a data provider, which contains a constructor
- *        with parameter (DataConfig, ModelConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
-  static InitFunction __reg_type_##__type_name([] {                     \
-    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-class DataBatch;
-class BufferBatch;
-typedef std::shared_ptr<DataBatch> DataBatchPtr;
-typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
-/**
- * @brief Data for batch training a neural network
- */
-class DataBatch {
- public:
-  DataBatch() : size_(0) { data_.clear(); }
-  /**
-   * @brief Get batch size
-   * @return batch size
-   */
-  int64_t getSize() const { return size_; }
-  /**
-   * @brief Get num of sequences of sequence data
-   * @return num of sequences
-   */
-  int64_t getNumSequences() const {
-    if (data_.empty()) return size_;
-    return data_[0].sequenceStartPositions
-               ? data_[0].sequenceStartPositions->getSize() - 1
-               : size_;
-  }
-  /**
-   * @brief Set batch size
-   * @param[in] size size
-   */
-  void setSize(int64_t size) { size_ = size; }
-  /**
-   * @brief Get size of argument vector
-   * @return size of argument vector
-   * @note For usual supervised learning, input data and label is needed,
-   * then there will be two argument.
-   */
-  int64_t getNumStreams() const { return data_.size(); }
-
-  /**
-   * @brief Get a argument with index i
-   * @param[in] i index in argument vector
-   * @return a argument with index i
-   */
-  const Argument& getStream(int i) const { return data_[i]; }
-  /**
-   * @brief Get all argument
-   * @return an argument vector
-   */
-  std::vector<Argument>& getStreams() { return data_; }
-  /**
-   * @brief Get all argument const
-   * @return an argument vector
-   */
-  std::vector<Argument> getStreams() const { return data_; }
-  /**
-   * @brief Clear DataBatch
-   */
-  void clear() {
-    data_.clear();
-    size_ = 0;
-  }
-
-  /**
-   * @brief Append data to DataBatch
-   * @param[in] data  matrix data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(MatrixPtr data) {
-    Argument argu;
-    argu.value = data;
-    data_.push_back(argu);
-  }
-
-  /**
-   * @brief Append sequence data to DataBatch
-   * @param[in] data                      matrix data
-   * @param[in] sequenceStartPositions    sequence data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(const MatrixPtr& data,
-                  const ICpuGpuVectorPtr& sequenceStartPositions) {
-    Argument argu;
-    argu.value = data;
-    argu.sequenceStartPositions = sequenceStartPositions;
-    data_.push_back(argu);
-  }
-  /**
-   * @brief Append label data
-   * @param[in]  label    label data
-   * @param[in]  value    matrix data, default null
-   */
-  void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) {
-    Argument argu;
-    argu.ids = label;
-    argu.value = value;
-    data_.push_back(argu);
-  }
-
-  /*
-   * @brief Append argument
-   * @param[in]  argus   DataBatch.getStreams()
-   * @param[in]  size    DataBatch.getSize()
-   * @param[in]  dataId  sub dataprovider id (in MultiDataProvider)
-   */
-  void appendArguments(const std::vector<Argument>& argus,
-                       int size,
-                       int dataId) {
-    size_ += size;
-    for (const auto& argu : argus) {
-      data_.push_back(argu);
-      data_.back().dataId = dataId;
-    }
-  }
-
- protected:
-  /**
-   * @brief batch size
-   */
-  int64_t size_;
-  /**
-   * @brief A batch data consist of a Argument vector,
-   * An argument corresponds to a type of input data.
-   */
-  std::vector<Argument> data_;
-};
-
-class BufferBatch {
- public:
-  BufferBatch() {
-    hlStream_ = HPPL_STREAM_DEFAULT;
-    hlEvent_ = NULL;
-    batchData_ = NULL;
-  }
-  ~BufferBatch() {
-    if (hlEvent_) {
-      hl_destroy_event(hlEvent_);
-      hlEvent_ = NULL;
-    }
-    delete batchData_;
-    batchData_ = NULL;
-  }
-
-  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
-  DataBatch* getDataBatch() { return batchData_; }
-
-  void setCuStream(hl_stream_t stream) { hlStream_ = stream; }
-  hl_stream_t getCuStream() const { return hlStream_; }
-
-  void setCuEvent(hl_event_t event) { hlEvent_ = event; }
-
-  hl_event_t getCuEvent() const { return hlEvent_; }
-
-  void createCuEvent() {
-    if (!hlEvent_) {
-      hlStream_ = HPPL_STREAM_1;
-      hl_create_event(&hlEvent_);
-    }
-  }
-
-  void syncEvent() {
-    if (hlEvent_) {
-      hl_stream_wait_event(hlStream_, hlEvent_);
-    }
-  }
-
-  void swap(BufferBatch* bufBatch);
-  void clone(DataBatch* srcBatch, bool useGpu);
-
- protected:
-  DataBatch* batchData_;
-  hl_stream_t hlStream_;
-  hl_event_t hlEvent_;
-};
-
-class DataProvider;
-typedef std::shared_ptr<DataProvider> DataProviderPtr;
-
-typedef Queue<BufferBatch*> BufferBatchQueue;
-
-class DoubleBuffer {
- public:
-  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
-  virtual ~DoubleBuffer();
-  void removeOneBatch(DataBatch* dataBatch);
-
-  void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; }
-
-  int64_t getBatchSize() { return batchSize_; }
-
-  void startAsyncLoad();
-  void finishAsyncLoad() {
-    stopping_ = true;
-    taskReadySem_.post();
-    if (asyncLoader_) {
-      asyncLoader_->join();
-    }
-  }
-
-  void setPending(bool pending) { pending_ = pending; }
-
- protected:
-  virtual void asyncLoadBatch();
-  void insertOneBatch(DataBatch* batch);
-
-  DataProvider* dataPool_;
-  bool useGpu_;
-  int32_t batchSize_;
-  ThreadLocal<BufferBatchPtr> usingBatch_;
-  BufferBatchQueue* dataQueue_;
-  BufferBatchQueue* bufferQueue_;
-  std::unique_ptr<std::thread> asyncLoader_;
-  Semaphore taskReadySem_;
-  bool stopping_;
-  bool pending_;
-};
-
-/**
- * @brief Base class for DataProvider, which supplies data for training
- * @note It can supplies multiple streams of data.
- * For typical supervised training, there are two streams:
- * one is for input, one is for label.
- */
-class DataProvider {
- public:
-  static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
-  static DataProvider* create(const DataConfig& config,
-                              const ModelConfig& modelConfig,
-                              bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * @brief create only used for unittest.
-   */
-  inline static DataProvider* create(const DataConfig& config,
-                                     bool useGpu = FLAGS_use_gpu) {
-    return create(config, ModelConfig(), useGpu);
-  }
-
-  DataProvider(const DataConfig& config, bool useGpu)
-      : config_(config),
-        skipShuffle_(false),
-        usageRatio_(config.usage_ratio()),
-        useGpu_(useGpu) {
-    if (config_.async_load_data()) {
-      initAsyncLoader();
-    }
-  }
-  virtual ~DataProvider() {}
-
-  const DataConfig& getConfig() const { return config_; }
-
-  void setSkipShuffle() { skipShuffle_ = true; }
-
-  /**
-   * @brief Get next batch of training samples
-   * @param[in]    size    size of training samples to get
-   * @param[out]   batch   a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatch(int64_t size, DataBatch* batch);
-
-  /**
-   * @brief Shuffle the data set
-   */
-  virtual void shuffle() = 0;
-
-  /**
-   * @brief reset all the value of index
-   * @note reset() must be called before any calls to getNextBatch()
-   * IMPORTANT: subclass reset() should always call the base class reset()
-   * at the end of the function
-   */
-  virtual void reset() {
-    if (doubleBuffer_ != nullptr) {
-      doubleBuffer_->startAsyncLoad();
-    }
-  }
-
-  /**
-   * @brief Get the size of training samples
-   * @return the number of training samples in the data set.
-   * @note return -1 to indicate unlimited number of samples.
-   */
-  virtual int64_t getSize() = 0;
-
-  /**
-   * @brief Get next batch training samples internally
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
-
- protected:
-  DataConfig config_;
-  bool skipShuffle_;
-  float usageRatio_;
-  bool useGpu_;
-  std::unique_ptr<DoubleBuffer> doubleBuffer_;
-  ThreadLocal<std::vector<MatrixPtr>> constantSlots_;
-  /**
-   * @@brief Get next batch training samples from buffer
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch);
-
-  void initAsyncLoader();
-};
-
-/**
- * A data provider which does nothing. It only serves as providing
- * necessary configurations such as stream_names
- */
-class DummyDataProvider : public DataProvider {
- public:
-  DummyDataProvider(const DataConfig& config, bool useGpu)
-      : DataProvider(config, useGpu) {}
-  virtual void shuffle() {}
-  virtual void reset() { DataProvider::reset(); }
-  virtual int64_t getSize() { return 0; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) {
-    (void)size;
-    (void)batch;
-    return 0;
-  }
-};
-
-/**
- * Data provider for one input and one integer label.
- */
-class SimpleDataProviderBase : public DataProvider {
- protected:
-  /// sample feature dimension
-  int64_t sampleDim_;
-  /// the number of samples
-  int64_t bufferCapacity_;
-  int64_t sampleNumInBuf_;
-  /// next item to read in buffer
-  int64_t nextItemIndex_;
-  /// some user defined info for validation
-  bool withInfo_;
-
-  /// data buffer: bufferCapacity_ * nDataDim_
-  CpuMatrixPtr hInputDataBuf_;
-
-  /// label buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputLabelBuf_;
-
-  /// info buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputInfoBuf_;
-
-  ThreadLocal<MatrixPtr> dataBatch_;
-  ThreadLocal<IVectorPtr> labelBatch_;
-  ThreadLocal<IVectorPtr> infoBatch_;
-
-  RWLock lock_;
-
- public:
-  SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
-  ~SimpleDataProviderBase() {}
-
-  void shuffle();
-
-  virtual void reset();
-
-  virtual int64_t getSize();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-  /// return the number of samples in the buffer
-  int64_t fillBuffer();
-
- protected:
-  /**
-   * @brief Fill at most size samples into data and label.
-   *
-   * Each input is stored in contiguous memory locations in data.
-   *
-   * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for
-   * the input of the n-th sample.
-   *
-   * label[n] is the label for the n-th sample.
-   */
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size) = 0;
-};
-
-class SimpleDataProvider : public SimpleDataProviderBase {
- public:
-  SimpleDataProvider(const DataConfig& config, bool useGpu);
-  ~SimpleDataProvider();
-  virtual void reset();
-
- protected:
-  void loadData(const std::string& fileName);
-  void loadDataFile(const std::string& fileName);
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size);
-
- protected:
-  size_t currentSampleIndex_;
-  std::vector<int> labels_;
-  std::vector<real> data_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProviderGroup.h b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
deleted file mode 100644
index 91c94dc98..000000000
--- a/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-template <class T>
-class DataProviderGroup : public DataProvider {
- protected:
-  typedef T ProviderType;
-  typedef std::shared_ptr<ProviderType> ProviderPtrType;
-  ProviderPtrType provider_;
-
-  std::vector<std::string> fileList_;
-  std::mutex lock_;
-  std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
-
- public:
-  DataProviderGroup(const DataConfig& config, bool useGpu);
-  ~DataProviderGroup() {}
-
-  virtual void reset();
-  virtual void shuffle() {}
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
- private:
-  void startLoader();
-  void stopLoader();
-  void forceStopLoader();
-  ProviderPtrType loadFile(const std::vector<std::string>& fileList);
-};
-
-template <class T>
-DataProviderGroup<T>::DataProviderGroup(const DataConfig& config, bool useGpu)
-    : DataProvider(config, useGpu) {
-  // load file list
-  loadFileList(config_.files(), fileList_);
-  CHECK_GT(fileList_.size(), 0LU);
-  LOG(INFO) << "load file list, numfiles=" << fileList_.size()
-            << ", max_num_of_data_providers_in_memory="
-            << (1 + config_.file_group_conf().queue_capacity() +
-                config_.file_group_conf().load_thread_num());
-}
-
-template <class T>
-void DataProviderGroup<T>::reset() {
-  forceStopLoader();
-  CHECK(!loader_);
-  provider_ = nullptr;
-
-  // shuffle file list
-  std::shuffle(
-      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
-
-  startLoader();
-  DataProvider::reset();
-}
-
-template <class T>
-int64_t DataProviderGroup<T>::getNextBatchInternal(int64_t size,
-                                                   DataBatch* batch) {
-  std::lock_guard<std::mutex> guard(lock_);
-
-  if (!loader_) {
-    return 0;
-  }
-  if (provider_) {
-    int64_t ret = provider_->getNextBatchInternal(size, batch);
-    if (ret > 0) {
-      return ret;
-    }
-  }
-
-  // else get data from next data provider
-  if (loader_->testResult()) {
-    LOG(INFO) << "WAIT provider";
-  }
-  provider_ = loader_->waitResult();
-  if (!provider_) {
-    stopLoader();  // All the data providers have been returned
-    return 0;
-  }
-  int64_t ret = provider_->getNextBatchInternal(size, batch);
-  CHECK(ret > 0) << "new data provider does not contain any valid samples!";
-  return ret;
-}
-
-template <class T>
-void DataProviderGroup<T>::startLoader() {
-  loader_.reset(new MultiThreadWorker<ProviderType>(
-      config_.file_group_conf().load_thread_num(),
-      config_.file_group_conf().queue_capacity()));
-
-  int loadFileCount = config_.file_group_conf().load_file_count();
-  for (size_t startPos = 0; startPos < fileList_.size();
-       startPos += loadFileCount) {
-    size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
-    std::vector<std::string> fileVec(fileList_.begin() + startPos,
-                                     fileList_.begin() + endPos);
-    loader_->addJob([this, fileVec]() -> ProviderPtrType {
-      return this->loadFile(fileVec);
-    });
-  }
-  loader_->stopAddJob();
-}
-
-template <class T>
-void DataProviderGroup<T>::stopLoader() {
-  if (loader_) {
-    loader_->stop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-void DataProviderGroup<T>::forceStopLoader() {
-  if (loader_) {
-    loader_->forceStop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-std::shared_ptr<T> DataProviderGroup<T>::loadFile(
-    const std::vector<std::string>& fileList) {
-  // disable async_load_data in sub dataprovider
-  DataConfig subConfig = config_;
-  subConfig.set_async_load_data(false);
-
-  CHECK(!fileList.empty()) << "fileList is empty";
-  ProviderPtrType provider =
-      std::make_shared<ProviderType>(subConfig, useGpu_, false);
-  provider->loadData(fileList);
-  provider->reset();
-  return provider;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
deleted file mode 100644
index e5fc6d8a8..000000000
--- a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiDataProvider.h"
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-using namespace std;
-
-MultiDataProvider::MultiDataProvider(const DataConfig& config,
-                                     const ModelConfig& modelConfig,
-                                     bool useGpu)
-    : DataProvider(config, useGpu) {
-  bool atLeastOneMainDataFlag = false;
-  totalDataRatio_ = 0;
-  LOG(INFO) << "MultiDataProvider: sub data provider size: "
-            << config.sub_data_configs_size();
-  LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test();
-  isTestMode_ = config.for_test();
-  for (int i = 0; i < config.sub_data_configs_size(); i++) {
-    LOG(INFO) << "dataRatio of sub(" << i
-              << ") is: " << config.sub_data_configs(i).data_ratio();
-    totalDataRatio_ += config.sub_data_configs(i).data_ratio();
-    if (config.sub_data_configs(i).is_main_data()) {
-      LOG(INFO) << "main data is [" << i << "]";
-      atLeastOneMainDataFlag = true;
-    }
-  }
-  CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not"
-                                << " have is_main_data flag";
-  LOG(INFO) << "totalDataRatio_=" << totalDataRatio_;
-  DataConfig subConfig;
-  int subDataProviderCount = config.sub_data_configs_size();
-  if (isTestMode()) {
-    LOG(INFO) << "construct MultiDataProvider in test mode";
-  } else {
-    LOG(INFO) << "construct MultiDataProvider in train mode";
-  }
-  subDataProviders_.resize(subDataProviderCount);
-  for (int i = 0; i < subDataProviderCount; i++) {
-    subConfig = config.sub_data_configs(i);
-    if (subConfig.async_load_data()) {
-      LOG(INFO) << "can not use async_load_data in sub dataprovider of "
-                   "MultiDataProvider";
-      subConfig.set_async_load_data(false);
-    }
-    subDataProviders_[i] = std::unique_ptr<DataProvider>(
-        DataProvider::create(subConfig, modelConfig, useGpu_));
-  }
-}
-
-void MultiDataProvider::reset() {
-  for (auto& elem : subDataProviders_) {
-    elem->reset();
-  }
-  DataProvider::reset();
-}
-
-void MultiDataProvider::shuffle() {
-  for (auto& elem : subDataProviders_) {
-    elem->shuffle();
-  }
-}
-
-int64_t MultiDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  batch->clear();
-  for (size_t i = 0; i < subDataProviders_.size(); ++i) {
-    // calc size according to data ratio
-    int64_t subSize =
-        (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() /
-                  totalDataRatio_);
-    DataBatch subBatch;
-    int64_t realSize =
-        subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-    if (realSize == 0) {
-      // current subDataProvider has no data
-      if (!isTestMode()) {
-        // in train mode
-        if (config_.sub_data_configs(i).is_main_data()) {
-          // is main data provider. then return 0
-          batch->clear();
-          return 0;
-        } else {
-          // not main data provider, reset current subDataProvider and try again
-          subDataProviders_[i]->reset();
-          subBatch.clear();
-          realSize =
-              subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-          CHECK_GT(realSize, 0);
-        }
-      } else {
-        // in test mode, make an empty argument
-        Argument emptyArgu;
-        std::vector<Argument> argus;
-        argus.push_back(emptyArgu);
-        batch->appendArguments(argus, 0, -1);
-        continue;
-      }
-    }
-    batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i);
-  }
-  return batch->getSize();
-}
-
-REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.h b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
deleted file mode 100644
index baa1fc019..000000000
--- a/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-class MultiDataProvider : public DataProvider {
- protected:
-  std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
-
- public:
-  MultiDataProvider(const DataConfig& config,
-                    const ModelConfig& modelConfig,
-                    bool useGpu);
-  ~MultiDataProvider() {}
-  virtual void reset();
-  virtual void shuffle();
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-  bool isTestMode() const { return isTestMode_; }
-
- private:
-  int totalDataRatio_;
-  bool isTestMode_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/ProtoReader.h b/paddle/legacy/gserver/dataproviders/ProtoReader.h
deleted file mode 100644
index 08d045226..000000000
--- a/paddle/legacy/gserver/dataproviders/ProtoReader.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/gzip_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- * ProtoReader/ProtoWriter are used to read/write a sequence of protobuf
- * messages from/to i/ostream.
- */
-class ProtoReader {
- public:
-  explicit ProtoReader(std::istream* s, bool dataCompression = false) {
-    CHECK(s) << "istream pointer is nullptr";
-    istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
-    if (dataCompression) {
-      gzipInput_.reset(
-          new google::protobuf::io::GzipInputStream(istreamInput_.get()));
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-    } else {
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-    }
-    dataCompression_ = dataCompression;
-    approximateReadedBytes_ = 0;
-    codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                    kDefaultTotalBytesLimit);
-  }
-
-  /**
-   * read one message
-   */
-  bool read(google::protobuf::MessageLite* msg) {
-    if (approximateReadedBytes_ >= kMaxLimitBytes) {
-      // Once bytes we read get close to 64MB(larger than 55MB),
-      // we re-intialize the codedInputStream object.
-      approximateReadedBytes_ = 0;
-
-      /**
-       * Explicitly destroys the object owned by unique_ptr at first and then
-       * construct an new object.
-       *
-       * 1.reset()
-       *
-       * 2.reset(new ...)   <-- such sequence is EXTREAMLY important!
-       *
-       * Reason: (!!!Read me before you modify the following 2 lines of
-       * codes!!!)
-       *
-       * Otherwise, reset() method will ask the CodedInputStream constructor
-       * to construct the new object at first forcing the IstreamInputStream
-       * object to move its underlying pointer to the next 8192 bytes.
-       *
-       * Then the old object will be destroied calling
-       * IstreamInputStream::BackUp() to move the underlying pointer back.
-       * This means that the InstreamInputStream object is referenced by
-       * 2 different CodedInputStream object at the same time which "confuses"
-       * the position of istreamInput_'s underlying pointer. Such fatal
-       * confusion will lead to undefined behaviour when 'codedInput_' is
-       * used to read new data.
-       *
-       */
-      codedInput_.reset();
-      if (dataCompression_) {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-      } else {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-      }
-      codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                      kDefaultTotalBytesLimit);
-    }
-
-    uint32_t size;
-    if (!codedInput_->ReadVarint32(&size)) {
-      return false;
-    }
-    google::protobuf::io::CodedInputStream::Limit limit =
-        codedInput_->PushLimit(size);
-    CHECK(msg->ParseFromCodedStream(codedInput_.get()));
-    codedInput_->PopLimit(limit);
-
-    /**
-     * size is varint in the data file, we don't know the length.
-     * We assume every size takes 4 bytes in the data file.
-     */
-    approximateReadedBytes_ += 4 + size;
-    return true;
-  }
-
- protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
-  std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
-  std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
-  bool dataCompression_;
-
-  /**
-   * This is the maximum number of bytes that this CodedInputStream will read
-   * before refusing to continue.
-   */
-  static const int kDefaultTotalBytesLimit = 64 << 20;  // 64MB
-
-  /**
-   * If data readed by the reader is more than 55MB( << 64MB),
-   * we reset the CodedInputStream object.
-   * This can help avoid 64MB warning which will cause the ParseFromCodedStream
-   * to fail.
-   */
-  static const int kMaxLimitBytes = 55 << 20;
-
-  /**
-   * This variable dosen't store the exact bytes readed by CodedInputStream
-   * object since which is constructed. Instead, it store the approximate bytes
-   * because we can't tell how many bytes are readed by the object with the
-   * help of API.
-   *
-   * @note this code depends on protobuf 2.4.0. There is nothing like
-   * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
-   * bytes has the object readed so far. Therefore, we calculated bytes
-   * ourselves.
-   */
-  int approximateReadedBytes_;
-};
-
-class ProtoWriter {
- public:
-  explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
-    CHECK(s) << "ostream pointer is nullptr";
-    ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
-    if (dataCompression) {
-      gzipOutput_.reset(
-          new google::protobuf::io::GzipOutputStream(ostreamOutput_.get()));
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(gzipOutput_.get()));
-    } else {
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(ostreamOutput_.get()));
-    }
-  }
-
-  /**
-   * write one message.
-   */
-  bool write(const google::protobuf::MessageLite& msg) {
-    codedOutput_->WriteVarint32(msg.ByteSize());
-    bool ret = msg.SerializeToCodedStream(codedOutput_.get());
-    return ret;
-  }
-
- protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
-  std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
-  std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
deleted file mode 100644
index 0827bd39d..000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PyDataProvider.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-#ifndef PADDLE_NO_PYTHON
-REGISTER_DATA_PROVIDER(py, PyDataProvider);
-#endif
-
-PyDataProvider::PyDataProvider(const DataConfig& config,
-                               bool useGpu,
-                               bool loadDataAll)
-    : DataProvider(config, useGpu), batchSize_(0) {
-  PyGuard guard;
-  pyModuleName_ = config_.load_data_module();
-  pyClassName_ = config_.load_data_object();
-  if (config_.load_data_args() != "") {
-    pyUserArgs_["load_data_args"] = config_.load_data_args();
-  }
-
-  if (loadDataAll) {
-    std::vector<std::string> fileList;
-    if (!config_.files().empty()) {
-      loadFileList(config_.files(), fileList);
-    }
-    loadData(fileList);
-  }
-}
-
-void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
-  classInstance_ =
-      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
-  CHECK(classInstance_) << "Create class instance failed.";
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
-  CHECK_PY(obj) << "Call function getHeader failed.";
-  std::string headerInfo =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  parseHeaderData(headerInfo);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-void PyDataProvider::parseHeaderData(const std::string& headerData) {
-  char* pHeader = const_cast<char*>(headerData.c_str());
-  char* pHeaderEnd = pHeader + headerData.size();
-  slotNum_ = readT<unsigned int>(pHeader, pHeaderEnd);
-  unsigned int useSequenceFlag = readT<unsigned int>(pHeader, pHeaderEnd);
-  isIID_ = useSequenceFlag != 1;
-  slots_.clear();
-  slots_.reserve(slotNum_);
-  for (size_t i = 0; i < slotNum_; ++i) {
-    unsigned int slotType = readT<unsigned int>(pHeader, pHeaderEnd);
-    unsigned int slotDim = readT<unsigned int>(pHeader, pHeaderEnd);
-    slots_.emplace_back();
-    slots_.back().dim = slotDim;
-    slots_.back().type = static_cast<SlotDef_SlotType>(slotType);
-  }
-}
-
-void PyDataProvider::resetSlots() {
-  for (auto& slot : slots_) {
-    slot.indexData.clear();
-    slot.denseData.clear();
-    slot.sparseNonValueData.clear();
-    slot.sparseFloatValueData.clear();
-    slot.indices.clear();
-    slot.sequenceStartPositions.clear();
-    slot.sampleSequenceIdVec.clear();
-    slot.subSequenceStartPositions.clear();
-    slot.strData.clear();
-  }
-}
-
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  unsigned int dim = slot.dim;
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  slot.denseData.resize(slot.sampleNum * dim);
-#ifdef PADDLE_TYPE_DOUBLE
-  CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd)
-      << "std::copy data is out of range";
-  // PyDataProvider always provide data in float
-  float* dat = reinterpret_cast<float*>(data);
-  std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
-#else
-  memcpyWithCheck(slot.denseData.data(),
-                  data,
-                  sizeof(real) * dim * slot.sampleNum,
-                  dataEnd);
-#endif
-  // PyDataProvider always provide data in float
-  data += sizeof(float) * dim * slot.sampleNum;
-}
-
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
-                                            char*& data,
-                                            const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  slot.indices.push_back(length);
-  slot.sparseNonValueData.resize(length);
-  memcpyWithCheck(slot.sparseNonValueData.data(),
-                  data,
-                  sizeof(unsigned int) * length,
-                  dataEnd);
-  data += sizeof(unsigned int) * length;
-}
-
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
-                                         char*& data,
-                                         const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  unsigned int* colPtr = reinterpret_cast<unsigned int*>(data);
-  CHECK_LE(data + sizeof(unsigned int) * length, dataEnd)
-      << "Data is out of range";
-  data += sizeof(unsigned int) * length;
-  size_t colLen = readT<unsigned int>(data, dataEnd);
-  CHECK_EQ(colLen, length);
-  float* valuePtr = reinterpret_cast<float*>(data);
-  CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range";
-  data += sizeof(real) * length;
-  slot.indices.push_back(length);
-  slot.sparseFloatValueData.resize(length);
-  for (unsigned int ii = 0; ii < length; ++ii) {
-    slot.sparseFloatValueData[ii].col = colPtr[ii];
-    slot.sparseFloatValueData[ii].value = valuePtr[ii];
-  }
-}
-
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign is out of range";
-  slot.indexData.assign(reinterpret_cast<int*>(data),
-                        reinterpret_cast<int*>(data) + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-}
-
-void PyDataProvider::fillStringSlot(ProtoSlot& slot,
-                                    char*& data,
-                                    const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  for (unsigned int i = 0; i < slot.sampleNum; ++i) {
-    size_t len = readT<uint32_t>(data, dataEnd);
-    auto str_begin = data;
-    data += len;
-    CHECK_LE(data, dataEnd) << "Data is out of range";
-    slot.strData.emplace_back(str_begin, len);
-  }
-}
-
-void PyDataProvider::fillSlotsByStr(const std::string& samples) {
-  char* data = const_cast<char*>(samples.c_str());
-  char* dataEnd = data + samples.size();
-  batchSize_ = readT<unsigned int>(data, dataEnd);
-  if (0 == batchSize_) {
-    return;
-  }
-
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type)
-        << " Slot type:" << slot.type << " is out of range.";
-    CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type
-                                               << " is out of range.";
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE:
-        fillDenseSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        fillSparseNonValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        fillSparseValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::INDEX:
-        fillIndexSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        fillStringSlot(slot, data, dataEnd);
-        break;
-    }
-  }
-  // read sequenceStartPositions
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData()) {
-      unsigned int sequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.sequenceNum = sequenceNum;
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        slot.sequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        size_t begin = slot.sequenceStartPositions[i];
-        size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
-                                           : slot.sampleNum;
-        for (size_t ii = begin; ii < end; ++ii) {
-          slot.sampleSequenceIdVec.push_back(ii);
-        }
-      }
-    } else {
-      for (size_t i = 0; i < slot.sampleNum; ++i) {
-        slot.sampleSequenceIdVec.push_back(i);
-      }
-    }
-  }
-  // read subSequenceStartPositions, not all slots have this infomation.
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData() && data != dataEnd) {
-      unsigned int subSequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.subSequenceNum = subSequenceNum;
-      for (size_t i = 0; i < subSequenceNum; ++i) {
-        slot.subSequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-    }
-  }
-}
-
-void PyDataProvider::reset() {
-  {  // Invoke PyDataProvider Reset
-    PyGuard guard;
-    PyObjectPtr obj(PyObject_CallMethod(
-        classInstance_.get(), const_cast<char*>("reset"), NULL));
-    CHECK_PY(obj) << "Call function reset failed.";
-  }
-
-  if (!skipShuffle_) {
-    // Invoke PyDataProvider Shuffle
-    shuffle();
-  }
-  DataProvider::reset();
-}
-
-void PyDataProvider::shuffle() {
-  // py shuffle
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
-  CHECK_PY(obj) << "Call function shuffle failed.";
-}
-
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
-                         slot.sampleNum,
-                         dim,
-                         false,   // trans = false
-                         false);  // useGpu = false
-  real* buf = cpuArguments[slotIndex].value->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    memcpyWithCheck(buf + i * dim,
-                    slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim,
-                    sizeof(real) * dim,
-                    slot.denseData.data() + slot.denseData.size());
-  }
-}
-
-void PyDataProvider::handleSparseNonValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   NO_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data(),
-        HPPL_STREAM_1);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleSparseValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   FLOAT_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data(),
-        HPPL_STREAM_DEFAULT);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
-                          slot.sampleNum,
-                          /*useGpu_*/ false);
-  int* buf = cpuArguments[slotIndex].ids->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-void PyDataProvider::handleStringSlot(ProtoSlot& slot,
-                                      size_t slotIndex,
-                                      std::vector<Argument>& cpuArguments) {
-  if (cpuArguments[slotIndex].strs) {
-    cpuArguments[slotIndex].strs->resize(slot.sampleNum);
-  } else {
-    cpuArguments[slotIndex].strs =
-        std::make_shared<std::vector<std::string>>(slot.sampleNum);
-  }
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    (*cpuArguments[slotIndex].strs)[i] =
-        slot.strData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("getNextBatch"),
-                                      const_cast<char*>("i"),
-                                      size));
-  CHECK_PY(obj) << "Call function getNextBatch failed.";
-  const std::string& samples =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  resetSlots();
-  fillSlotsByStr(samples);
-  size = batchSize_;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(slotNum_);
-
-  if (!iidData()) {
-    for (size_t j = 0; j < slotNum_; ++j) {
-      auto& slot = slots_[j];
-      ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
-                                    slot.sequenceNum + 1,
-                                    /* useGpu= */ false);
-      int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
-      std::copy(slot.sequenceStartPositions.begin(),
-                slot.sequenceStartPositions.end(),
-                buf);
-      buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
-
-      if (slot.subSequenceStartPositions.size()) {
-        ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
-                                      slot.subSequenceNum + 1,
-                                      /*  useGpu= */ false);
-        int* buf =
-            cpuArguments[j].subSequenceStartPositions->getMutableData(false);
-        std::copy(slot.subSequenceStartPositions.begin(),
-                  slot.subSequenceStartPositions.end(),
-                  buf);
-        buf[slot.subSequenceNum] = slot.sampleNum;
-        // check subSequenceStartPositions and sequenceStartPositions
-        cpuArguments[j].checkSubset();
-      }
-    }
-  }
-
-  for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) {
-    auto& slot = slots_[slotIndex];
-    SlotDef::SlotType slotType = slot.type;
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE:
-        handleDenseSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        handleSparseNonValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        handleSparseValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::INDEX:
-        handleIndexSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        handleStringSlot(slot, slotIndex, cpuArguments);
-        break;
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < slotNum_; ++i) {
-      SlotDef::SlotType slotType = slots_[i].type;
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-
-        if (slots_[i].subSequenceStartPositions.size()) {
-          gpuArguments[i].subSequenceStartPositions =
-              cpuArguments[i].subSequenceStartPositions;
-        }
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.h b/paddle/legacy/gserver/dataproviders/PyDataProvider.h
deleted file mode 100644
index 4b8bea04a..000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include "DataFormat.pb.h"
-#include "DataProvider.h"
-
-#include <vector>
-
-namespace paddle {
-
-class PyDataProvider : public DataProvider {
- public:
-  PyDataProvider(const DataConfig& config,
-                 bool useGpu,
-                 bool loadDataAll = true);
-
-  virtual void reset();
-
-  // Note this size includes the sequences which are skipped because they
-  // are longer than the batch size
-  virtual int64_t getSize() {
-    LOG(FATAL) << "Not implement yet";
-    return -1;
-  }
-  virtual void shuffle();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
- protected:
-  struct ProtoSlot;
-  // return false if each each sample is one sequence, i.e., independent
-  // of other samples.
-  inline bool iidData() const { return isIID_; }
-
-  void parseHeaderData(const std::string& headerData);
-  void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSparseNonValueSlot(ProtoSlot& slot,
-                              char*& data,
-                              const char* dataEnd);
-  void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSlotsByStr(const std::string& samples);
-  void handleDenseSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleSparseNonValueSlot(ProtoSlot& slot,
-                                size_t slotIndex,
-                                std::vector<Argument>& cpuArguments);
-  void handleSparseValueSlot(ProtoSlot& slot,
-                             size_t slotIndex,
-                             std::vector<Argument>& cpuArguments);
-  void handleIndexSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleStringSlot(ProtoSlot& slot,
-                        size_t slotIndex,
-                        std::vector<Argument>& cpuArguments);
-  void resetSlots();
-  void loadData(const std::vector<std::string>& fileList);
-
- protected:
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    unsigned int sampleNum;
-    unsigned int sequenceNum;
-    unsigned int subSequenceNum;
-    // Store the data of index type slot
-    std::vector<int> indexData;
-    // Store the data of dense type slot
-    std::vector<real> denseData;
-    // Store the data of sparseNonValue type slot
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    // Store the data of sparseValue type slot
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    // Used to store the index of each sample in slot values
-    std::vector<int64_t> indices;
-    // The starting position of each sequence in samples
-    // The last element should be the number of samples
-    // If empty, each sample is one sequence.
-    std::vector<size_t> sequenceStartPositions;
-    // The index id of sequences in slot
-    std::vector<int64_t> sampleSequenceIdVec;
-    // The starting position of each subsequence in samples
-    // The last element should be the number of subsequence
-    // If empty, each sequence of sample has no subsequence.
-    std::vector<size_t> subSequenceStartPositions;
-    // Store the data of string type slot
-    std::vector<std::string> strData;
-  };
-  std::vector<ProtoSlot> slots_;
-
-  PyObjectPtr classInstance_;
-  unsigned int batchSize_;
-  unsigned int slotNum_;
-  // if use sequence, isIID_ equals false, otherwise it is true.
-  bool isIID_;
-  // The name of python module name
-  std::string pyModuleName_;
-  // The name of python class name
-  std::string pyClassName_;
-  // User args set in config
-  std::map<std::string, std::string> pyUserArgs_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
deleted file mode 100644
index 8e931e406..000000000
--- a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
+++ /dev/null
@@ -1,1031 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-
-#include <Python.h>
-#include <numpy/numpyconfig.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <list>
-#include <unordered_set>
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/ndarrayobject.h>
-
-#include "DataProvider.h"
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-namespace unittest {
-
-static std::unique_ptr<std::function<void(size_t /*poolActualSize */)>>
-    OnPoolFilled;
-
-namespace pydp2 {
-
-void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
-  OnPoolFilled.reset(new std::function<void(size_t)>());
-  *OnPoolFilled = callback;
-}
-
-void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
-
-}  // namespace pydp2
-}  // namespace unittest
-
-/**
- * Slot type
- */
-enum SlotType {
-  ST_DENSE = 0,
-  ST_NON_SPARSE_VALUE = 1,
-  ST_SPARSE_VALUE = 2,
-  ST_INDEX = 3
-};
-
-/**
- * Sequence type
- */
-enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
-
-/**
- * Cache Type.
- */
-enum CacheType {
-  NO_CACHE = 0,           // Each pass will load data from PyDataProvider2.
-  CACHE_PASS_IN_MEM = 1,  // First pass will load data from PyDataProvider2,
-                          // then cache all data in memory. Load data from
-                          // memory in rest passes.
-};
-
-struct SlotHeader {  // Slot Header will parse from python object's slots field.
-  size_t dim;
-  SlotType slotType;
-  SeqType seqType;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
-  os << "Dim = " << header.dim << " Type = " << header.slotType
-     << " SeqType = " << header.seqType;
-  return os;
-}
-
-/**
- * FieldScanner Interface.
- *
- * It will read python object, and fill to argument's each slot.
- * There are two steps, prepare and fill. Scanner will alloc memory during
- * prepare step, fill data into argument during fill step.
- */
-class IFieldScanner {
- public:
-  DISABLE_COPY(IFieldScanner);
-  /**
-   * Ctor.
-   * @param headerPtr slot header that scanner belong to.
-   */
-  explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {}
-  virtual ~IFieldScanner() {}
-
-  /**
-   * Start prepare step.
-   */
-  virtual void startPrepare(Argument& argument) {}
-
-  /**
-   * Prepare step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish Prepare step.
-   */
-  virtual void finishPrepare(Argument& argument) {}
-
-  /**
-   * Start fill step.
-   */
-  virtual void startFill(Argument& argument) {}
-
-  /**
-   * Fill step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish fill step.
-   */
-  virtual void finishFill(Argument& argument) {}
-
-  /**
-   * Factory method. Create a scanner by header. The final scanner may be
-   * combine many scanners.
-   *
-   * @note Fatal if header is not support.
-   */
-  static IFieldScanner* create(SlotHeader* header);
-
- protected:
-  SlotHeader* headerPtr_;
-};
-
-/**
- * Py Data Provider Cache Interface.
- */
-class IPyDataProviderCache {
- public:
-  virtual ~IPyDataProviderCache() {}
-
-  /**
-   * invoke when DataProvider::reset()
-   * @return true if read data from python.
-   */
-  virtual bool reset() = 0;
-
-  /**
-   * invoke when these data are used by DataProvider, and need to clear.
-   * @param [inout] data used data.
-   *
-   * @note The implemented class must clear these data array. Or if you want to
-   * delete the PyObjectPtr later, you should make sure the paddle process only
-   * have one active thread calling python code (use PyGuard otherwise).
-   */
-  virtual void drop(std::deque<PyObjectPtr>* data) = 0;
-
-  /**
-   * Return whole data in cache.
-   */
-  virtual std::deque<PyObjectPtr>* load() = 0;
-
-  /**
-   * Factory method. Convert CacheType to IPyDataProviderCache*
-   */
-  static IPyDataProviderCache* create(CacheType ct);
-};
-
-/**
- * PyDataProvider2.
- *
- * For usage, please refer python module 'paddle.trainer.PyDataProvider2'
- *
- * Here, we start a thread to read data. It is totally asynchronous for reading
- * data. And it support cache strategies.
- */
-class PyDataProvider2 : public DataProvider {
- public:
-  /**
-   * Ctor
-   */
-  PyDataProvider2(const DataConfig& config,
-                  const ModelConfig& modelConfig,
-                  bool useGpu)
-      : DataProvider(config, useGpu), callingContextCreated_(2) {
-    if (PyArray_API == NULL) import_array();
-    auto& args = config.load_data_args();
-    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
-    if (!args.empty()) {
-      kwargs = callPythonFuncRetPyObj(
-          "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
-    }
-
-    py::DictHelper kwargsDict(kwargs);
-    kwargsDict.setBool("is_train", !config.for_test());
-    std::vector<std::string> inputs;
-    inputs.reserve(modelConfig.input_layer_names().size());
-    std::copy(modelConfig.input_layer_names().begin(),
-              modelConfig.input_layer_names().end(),
-              std::back_inserter(inputs));
-    kwargsDict.setStringList("input_order", inputs);
-
-    // kwargs is keyword arguemts to create object.
-    this->createPyDataObj(config.load_data_module(),
-                          config.load_data_object(),
-                          config.files(),
-                          std::move(kwargs));
-    DBG << "Instance " << instance_.get() << " loaded.";
-    this->readPyFields(config.for_test());
-    DBG << "Py Field Done";
-  }
-
-  /**
-   * Dtor
-   * @note will stop loading thread when destructing
-   */
-  virtual ~PyDataProvider2() { resetImpl(false); }
-
- private:
-  void createPyDataObj(const std::string& model,
-                       const std::string& className,
-                       const std::string& fileListName,
-                       PyObjectPtr&& kwargs  // NOLINT
-                       ) {
-    LOG(INFO) << "loading dataprovider " << model << "::" << className;
-
-    PyObjectPtr module = py::import(model);
-    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
-    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
-    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
-    CHECK_PY(cls) << "load class " << className.c_str() << "error";
-
-    // If there are multiple python instance share same module, the PyObjectPtr
-    // only for instance will make python reference-count error.
-    //
-    // So here, we increase reference count manually.
-    Py_XINCREF(module.get());
-    Py_XINCREF(moduleDict.get());
-    Py_XINCREF(cls.get());
-
-    PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
-    PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
-    {
-      PyGuard guard;
-      instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get()));
-    }
-    CHECK_PY(instance_) << "Cannot Create instance";
-  }
-
-  void readPyFields(bool testing) {
-    py::ObjectHelper self(this->instance_);
-    bool ok;
-
-    this->skipShuffle_ =
-        !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
-    if (!ok) {
-      this->skipShuffle_ = testing;  // shuffle when is training, skip shuffle
-                                     // when is testing.
-    }
-    DBG << "Provider Skip Shuffle " << this->skipShuffle_;
-
-    this->poolSize_ = self.getIntAttr<size_t>("pool_size", &ok);
-    if (!ok) {
-      this->poolSize_ = -1UL;
-    }
-    this->minPoolSize_ = self.getIntAttr<size_t>("min_pool_size", &ok);
-    if (!ok) {
-      this->minPoolSize_ = -1UL;
-    }
-    this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_);
-
-    this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size");
-
-    calcBatchSize_.reset(self.getAttr("calc_batch_size"));
-    if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) {
-      this->calcBatchSize_.reset();
-    }
-
-    generator_.reset(self.getAttr("generator"));
-    CHECK(py::isCallable(generator_));
-
-    // Reading slots.
-    PyObjectPtr slotsPtr(self.getAttr("slots"));
-    py::SequenceHelper slots(slotsPtr);
-    headers_.reserve(slots.size());
-    for (size_t i = 0; i < slots.size(); ++i) {
-      headers_.emplace_back();
-      auto& header = headers_.back();
-      PyObject* hdPtr = slots[i];
-      CHECK(hdPtr != nullptr);
-      Py_XINCREF(hdPtr);
-      PyObjectPtr headerPtrWrap(hdPtr);
-      py::ObjectHelper hd(headerPtrWrap);
-      header.dim = hd.getIntAttrWithError<size_t>("dim");
-      header.seqType = (SeqType)hd.getIntAttrWithError<int>("seq_type");
-      header.slotType = (SlotType)hd.getIntAttrWithError<int>("type");
-    }
-
-    DBG << "Data header size " << headers_.size();
-    for (auto& header : headers_) {
-      DBG << header;
-    }
-    cache_.reset(IPyDataProviderCache::create(
-        (CacheType)self.getIntAttrWithError<int>("cache")));
-  }
-
-  PyObjectPtr loadPyFileLists(const std::string& fileListName) {
-    loadFileList(fileListName, fileLists_);
-    PyObject* lst = PyList_New(fileLists_.size());
-    for (size_t i = 0; i < fileLists_.size(); ++i) {
-      PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
-    }
-    return PyObjectPtr(lst);
-  }
-
-  void loadThread() {
-    DBG << "Creating context";
-    for (auto& filename : fileLists_) {
-      PyGuard g;
-      py::CallableHelper generator(this->generator_);
-      generator.setArgsSize(2);
-      generator.getArgs().set(0, instance_);
-      generator.getArgs().set(1, PyString_FromString(filename.c_str()), true);
-      callingContexts_.emplace_back(generator());
-      CHECK_PY(callingContexts_.back()) << "Generator error.";
-      CHECK(PyIter_Check(callingContexts_.back()));
-    }
-    DBG << "Create context done";
-    callingContextCreated_.wait();
-
-    PositionRandom p(skipShuffle_);
-
-    while (!exit_ && !callingContexts_.empty()) {
-      PyObject* data = nullptr;
-
-      {  // Read data.
-        size_t cid = p(callingContexts_.size());
-        bool atEnd;
-        data = py::iterNext(callingContexts_[cid], &atEnd);
-        if (atEnd || data == nullptr) {
-          if (cid != 0) {
-            std::swap(callingContexts_[cid], callingContexts_[0]);
-            cid = 0;
-          }
-
-          PyObjectPtr front;
-          {
-            std::unique_lock<std::mutex> l(mtx_);
-            front = pop_get_front(callingContexts_);
-          }
-          {
-            PyGuard g;
-            front.reset();
-          }
-          this->pullCV_.notify_all();
-          continue;
-        }
-      }
-
-      size_t additionalBatchSize = 1;
-      if (calcBatchSize_) {
-        PyGuard guard;
-        py::CallableHelper calcBatchSize(this->calcBatchSize_);
-        calcBatchSize.setArgsSize(1);
-        calcBatchSize.getArgs().set(0, data);
-        PyObjectPtr bs(calcBatchSize());
-        CHECK_PY(bs);
-        bool ok;
-        additionalBatchSize = py::castInt<size_t>(bs.get(), &ok);
-        CHECK(ok) << "CalcBatchSize must return int or long";
-      }
-
-      if (this->loadThread_) {  // wait poolActualSize < poolSize;
-        std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; });
-      }
-
-      {
-        std::lock_guard<std::mutex> guard(mtx_);
-        poolActualSize_ += additionalBatchSize;
-        dataPool_.emplace_back(data);
-      }
-      pullCV_.notify_all();
-    }
-    DBG << "load thread end";
-  }
-
-  inline void resetImpl(bool startNewThread) {
-    DBG << "Reseting " << startNewThread;
-    exit_.store(true);
-    if (loadThread_) {  // is loading.
-      loadThread_->join();
-      loadThread_.reset();
-    }
-    {
-      PyGuard g;
-      callingContexts_.clear();
-      this->pullCV_.notify_one();
-    }
-
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    {
-      PyGuard g;
-      dataPool_.clear();
-    }
-    poolActualSize_ = 0;
-
-    if (startNewThread && cache_->reset()) {
-      DBG << "Start new thread.";
-      loadThread_.reset(new std::thread([this] {
-        exit_ = false;
-        loadThread();
-      }));
-      callingContextCreated_.wait();
-    }
-    DBG << "Reset done";
-    exit_ = false;
-  }
-
- private:
-  std::unique_ptr<std::thread> loadThread_;
-  std::atomic<bool> exit_;
-  std::deque<PyObjectPtr> callingContexts_;
-  std::deque<PyObjectPtr> dataPool_;
-  size_t poolActualSize_;
-  std::condition_variable pushCV_;
-  std::condition_variable pullCV_;
-  std::mutex mtx_;
-
-  std::mutex mutexForReset_;
-
-  ThreadBarrier callingContextCreated_;
-  std::unique_ptr<IPyDataProviderCache> cache_;
-
-  PyObjectPtr instance_;
-  size_t poolSize_;
-  size_t minPoolSize_;
-  bool canOverBatchSize_;
-  PyObjectPtr calcBatchSize_;
-  PyObjectPtr generator_;
-  std::vector<std::string> fileLists_;
-  std::vector<SlotHeader> headers_;
-  static PyObjectPtr zeroTuple_;
-
-  class PositionRandom {
-   public:
-    inline explicit PositionRandom(bool skipRand)
-        : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
-
-    inline size_t operator()(size_t len) {
-      if (!skipRand_) {
-        if (!dist_ || dist_->b() != len - 1) {
-          dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
-        }
-        return (*dist_)(eng_);
-      } else {
-        return 0;
-      }
-    }
-
-   private:
-    std::default_random_engine& eng_;
-    std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
-    bool skipRand_;
-  };
-
-  // DataProvider interface
- public:
-  /**
-   * Resetting the PyDataProvider. May start reading thread here.
-   */
-  virtual void reset() {
-    resetImpl(true);
-    DataProvider::reset();
-  }
-
-  /**
-   * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
-   * select data from datapool.
-   */
-  void shuffle() {}
-
-  /**
-   * Not limited size.
-   */
-  int64_t getSize() { return -1; }
-
-  /**
-   * Loading a batch of data.
-   */
-  int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    REGISTER_TIMER("PyDP2.getNextBatchInternal")
-    CHECK_GE(size_, 0);
-    size_t size = (size_t)size_;
-    if (loadThread_) {  // loading from thread should wait for data pool ready.
-                        // but, loading from cache, cache object should ensure
-                        // data pool ready.
-      std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l, [this, &size] {
-        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
-               callingContexts_.empty();
-      });
-
-      if (unittest::OnPoolFilled) {
-        (*unittest::OnPoolFilled)(this->poolActualSize_);
-      }
-    }
-    std::deque<PyObjectPtr> data;
-    size_t bsize = 0;
-    std::deque<PyObjectPtr>* poolPtr = nullptr;
-
-    if (this->loadThread_) {  // loading from thread.
-      poolPtr = &this->dataPool_;
-    } else {  // loading from cache.
-      poolPtr = this->cache_->load();
-    }
-    if (exit_) {
-      // PyDataProvider is destructing.
-      return 0;
-    }
-    CHECK(poolPtr != nullptr);
-
-    std::deque<PyObjectPtr>& pool = *poolPtr;
-
-    while (bsize < size && !pool.empty()) {
-      {
-        // move data from pool to data
-        std::lock_guard<std::mutex> guard(mtx_);
-        if (skipShuffle_) {
-          size_t i = 0;
-          CHECK(pool[i] != nullptr);
-          data.emplace_back(std::move(pool[i]));
-          pool.pop_front();
-        } else {  // when shuffle, use swap to drop only last pool element.
-          size_t i = ThreadLocalRand::rand() % pool.size();
-          CHECK(pool[i] != nullptr);
-          if (i != 0) {
-            std::swap(pool[i], pool.front());
-          }
-          data.emplace_back(std::move(pool.front()));
-          pool.pop_front();
-        }
-
-        if (calcBatchSize_) {  // custom calc batch size.
-          PyGuard guard;
-          Py_INCREF(data.back().get());
-          py::CallableHelper calcBatchSize(calcBatchSize_);
-          calcBatchSize.setArgsSize(1);
-          calcBatchSize.getArgs().set(0, data.back());
-          PyObjectPtr customBatchSize(calcBatchSize());
-          bool ok;
-          size_t tmp = py::castInt<size_t>(customBatchSize.get(), &ok);
-          CHECK(ok) << "calc_batch_size must return int";
-
-          if (bsize + tmp > size && !canOverBatchSize_) {
-            // Put data back.
-            pool.push_front(std::move(data.back()));
-            data.pop_back();
-            break;
-          } else {
-            bsize += tmp;
-          }
-        } else {
-          bsize += 1;
-        }
-      }
-    }
-
-    if (this->loadThread_) {
-      {
-        std::lock_guard<std::mutex> g(mtx_);
-        poolActualSize_ -= bsize;
-      }
-      this->pushCV_.notify_all();
-    }
-
-    if (bsize == 0) {  // end of pass. In data pool, cannot get any data.
-      return 0;
-    }
-
-    DataBatch cpuBatch;
-    cpuBatch.setSize(bsize);
-    auto& inArgs = cpuBatch.getStreams();
-    inArgs.resize(headers_.size());
-    std::vector<std::unique_ptr<IFieldScanner>> scanners;
-    scanners.reserve(headers_.size());
-    for (auto& header : headers_) {
-      scanners.emplace_back(IFieldScanner::create(&header));
-    }
-    DBG << "Scanner created.";
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startPrepare(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->prepare(inArgs[i], s[i]);
-      }
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishPrepare(inArgs[i]);
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startFill(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->fill(inArgs[i], s[i]);
-      }
-    }
-
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishFill(inArgs[i]);
-    }
-
-    {
-      PyGuard g;
-      cache_->drop(&data);
-    }
-
-    DBG << "Reading CPU Batch Done.";
-
-    if (useGpu_) {
-      std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-      DataBatch& gpuBatch = *batch;
-      std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-      gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(bsize);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-      hl_stream_synchronize(HPPL_STREAM_1);
-    } else {
-      *batch = cpuBatch;
-    }
-    return bsize;
-  }
-};
-
-PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
-
-REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
-/**
- * Scanner for dense slot.
- */
-class DenseScanner : public IFieldScanner {
- public:
-  explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
-
-  /**
-   * Prepare.
-   * @param argument target argument
-   * @param obj each timestep of a sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreate(
-        argument.value, height_, headerPtr_->dim, false, false);
-    height_ = 0;
-  }
-
-  /**
-   * Fill argument from obj.
-   * @param argument
-   * @param obj
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
-    if (PyArray_Check(obj)) {
-      auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-      if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
-        real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
-        auto sz = PyArray_SIZE((PyArrayObject*)obj);
-        std::copy(data, data + sz, dat);
-      } else {
-        LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
-      }
-    } else {
-      py::SequenceHelper s(obj);
-      // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
-      for (size_t i = 0; i < headerPtr_->dim; ++i) {
-        dat[i] = (real)s.getDouble(i);
-      }
-    }
-    ++height_;
-  }
-
- private:
-  size_t height_;
-};
-
-/**
- * Scanner for index slot
- */
-class IndexScanner : public IFieldScanner {
- public:
-  explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
-
-  /**
-   * Prepare memory space.
-   *
-   * @note obj is a single timestep of sample
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    IVector::resizeOrCreate(argument.ids, cnt_, false);
-    cnt_ = 0;
-  }
-
-  /**
-   * Fill one index to argument.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    bool ok;
-    argument.ids->getData()[cnt_++] = py::castInt<int>(obj, &ok);
-    CHECK(ok) << "Cannot cast int " << py::repr(obj);
-  }
-
- private:
-  size_t cnt_;
-};
-
-class SparseNonValueScanner : public IFieldScanner {
- public:
-  explicit SparseNonValueScanner(SlotHeader* ptr)
-      : IFieldScanner(ptr), nnz_(0), height_(0) {}
-
-  /**
-   * Prepare memory space
-   * @note obj is a timestep of one sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    ++height_;
-    nnz_ += py::SequenceHelper(obj).size();
-  }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
-  }
-
-  virtual void startFill(Argument& argument) {
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    smat->getRows()[0] = 0;
-    nnz_ = 0;
-    height_ = 1;
-  }
-
-  /**
-   * Fill one sparse vector to argument.
-   * @note obj is a timestep of one sample.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sz = s.size();
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    int* row = smat->getRows();
-    int* col = smat->getCols();
-    real* dat = smat->getData();
-    row[height_] = row[height_ - 1] + (int)sz;
-
-    for (decltype(sz) i = 0; i < sz; ++i) {
-      setData(col + nnz_, dat + nnz_, s[i]);
-      ++nnz_;
-    }
-    ++height_;
-  }
-
- protected:
-  /**
-   * Set a single sparse index and value.
-   * @param [out] col sparse index
-   * @param [out] dat sparse value
-   * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
-   *                 For sparse_value is a Tuple (int, float).
-   */
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    bool ok;
-    *col = py::castInt<int>(obj, &ok);
-    CHECK(ok);
-  }
-
-  size_t nnz_;
-  size_t height_;
-};
-
-class SparseValueScanner : public SparseNonValueScanner {
- public:
-  explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
-  }
-
- protected:
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    SparseNonValueScanner::setData(col, dat, s[0]);
-    *dat = (real)s.getDouble(1);
-  }
-};
-
-/**
- * Sequence Scanner. Scanner for sequence or sub-sequence.
- */
-class SequenceScanner : public IFieldScanner {
- public:
-  /**
-   * Ctor
-   * @param innerScanner inner scanner for each timestep or sub-sequence.
-   * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr.
-   *                       return a sequence start position or a sub-sequence
-   *                       start position.
-   */
-  SequenceScanner(
-      std::unique_ptr<IFieldScanner>&& innerScanner,
-      const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
-      : IFieldScanner(nullptr),
-        inner_(std::move(innerScanner)),
-        cnt_(0),
-        getSeqStartPos_(getSeqStartPos) {}
-
-  /**
-   * Start prepare. Invoke inner->startPrepare too.
-   */
-  virtual void startPrepare(Argument& argument) {
-    inner_->startPrepare(argument);
-  }
-
-  /**
-   * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
-   * element of sequence obj.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->prepare(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish prepare. invoke inner_->finishPrepare too.
-   */
-  virtual void finishPrepare(Argument& argument) {
-    ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
-    inner_->finishPrepare(argument);
-  }
-
-  /**
-   * Start fill. invoke inner->startFill too.
-   */
-  virtual void startFill(Argument& argument) {
-    getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
-    cnt_ = 1;
-    inner_->startFill(argument);
-  }
-
-  /**
-   * Fill. Obj is a tuple or list. invoke inner->fill for each element of
-   * sequence obj. And set seqStartPos at same time. The seqStartPos will be
-   * calculated by getSeqStartPos callback passed in ctor.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
-        getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
-        (int)getSize(obj);
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->fill(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish fill. will invoke inner->finishFill too.
-   */
-  virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
-
- protected:
-  size_t getSize(PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
-    if (sc) {
-      size_t sum = 0;
-      for (size_t i = 0; i < s.size(); ++i) {
-        sum += sc->getSize(s[i]);
-      }
-      return sum;
-    } else {
-      return s.size();
-    }
-  }
-
- private:
-  std::unique_ptr<IFieldScanner> inner_;
-  size_t cnt_;
-  std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
-};
-
-IFieldScanner* IFieldScanner::create(SlotHeader* header) {
-  IFieldScanner* retv = nullptr;
-  switch (header->slotType) {
-    case ST_DENSE:
-      retv = new DenseScanner(header);
-      break;
-    case ST_INDEX:
-      retv = new IndexScanner(header);
-      break;
-    case ST_NON_SPARSE_VALUE:
-      retv = new SparseNonValueScanner(header);
-      break;
-    case ST_SPARSE_VALUE:
-      retv = new SparseValueScanner(header);
-      break;
-    default:
-      LOG(FATAL) << "Not implemented " << header->slotType;
-  }
-
-  switch (header->seqType) {
-    case SQT_NONE:
-      break;
-    case SQT_SUBSEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.subSequenceStartPositions;
-                                 });
-    // fall through, not break;
-    case SQT_SEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.sequenceStartPositions;
-                                 });
-      break;
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-
-  return retv;
-}
-
-/**
- * No Cache Strategy. Will destruct old data immediately and load data from
- * python every pass.
- */
-class NoCacheStrategy : public IPyDataProviderCache {
- public:
-  virtual bool reset() { return true; }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
-
-  virtual std::deque<PyObjectPtr>* load() { return nullptr; }
-};
-
-/**
- * Cache One Pass In Memory strategy.
- *
- * In first pass, will load data from python and store them in memory.
- * The rest passes, will load data from memory.
- */
-class CacheOnePassInMemory : public IPyDataProviderCache {
- public:
-  CacheOnePassInMemory()
-      : objPool_(new std::deque<PyObjectPtr>()),
-        droppedPool_(new std::deque<PyObjectPtr>()) {}
-
-  virtual bool reset() {
-    if (objPool_->empty() && droppedPool_->empty()) {
-      return true;
-    } else if (objPool_->empty()) {
-      std::swap(objPool_, droppedPool_);
-      return false;
-    } else {
-      LOG(FATAL) << "Unexpected branch";
-    }
-  }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) {
-    size_t orgSize = droppedPool_->size();
-    droppedPool_->resize(orgSize + data->size());
-    for (size_t i = 0; i < data->size(); ++i) {
-      std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
-    }
-    data->clear();
-  }
-
-  virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
-
- private:
-  std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
-  std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
-};
-
-IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
-  switch (ct) {
-    case NO_CACHE:
-      return new NoCacheStrategy();
-    case CACHE_PASS_IN_MEM:
-      return new CacheOnePassInMemory();
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-}
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
deleted file mode 100644
index c145adda5..000000000
--- a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * calculate sequence-to-sequence edit distance
- */
-class CTCErrorEvaluator : public Evaluator {
- private:
-  MatrixPtr outActivations_;
-  int numTimes_, numClasses_, numSequences_, blank_;
-  real deletions_, insertions_, substitutions_;
-  int seqClassficationError_;
-  mutable std::unordered_map<std::string, real> evalResults_;
-
-  std::vector<int> path2String(const std::vector<int>& path) {
-    std::vector<int> str;
-    str.clear();
-    int prevLabel = -1;
-    for (std::vector<int>::const_iterator label = path.begin();
-         label != path.end();
-         label++) {
-      if (*label != blank_ &&
-          (str.empty() || *label != str.back() || prevLabel == blank_)) {
-        str.push_back(*label);
-      }
-      prevLabel = *label;
-    }
-    return str;
-  }
-
-  std::vector<int> bestLabelSeq() {
-    std::vector<int> path;
-    path.clear();
-    real* acts = outActivations_->getData();
-    for (int i = 0; i < numTimes_; ++i) {
-      path.push_back(std::max_element(acts + i * numClasses_,
-                                      acts + (i + 1) * numClasses_) -
-                     (acts + i * numClasses_));
-    }
-    return path2String(path);
-  }
-
-  /* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
-   * insertion"
-   * in edit-distance error */
-  real stringAlignment(std::vector<int>& gtStr,
-                       std::vector<int>& recogStr,
-                       bool backtrace = true,
-                       real sp = 1.0,
-                       real dp = 1.0,
-                       real ip = 1.0) {
-    std::vector<std::vector<int>> matrix;
-    int substitutions, deletions, insertions;
-    real distance;
-    int n = gtStr.size();
-    int m = recogStr.size();
-
-    if (n == 0) {
-      substitutions = 0;
-      deletions = 0;
-      insertions = m;
-      distance = m;
-    } else if (m == 0) {
-      substitutions = 0;
-      deletions = n;
-      insertions = 0;
-      distance = n;
-    } else {
-      substitutions = 0;
-      deletions = 0;
-      insertions = 0;
-      distance = 0;
-      // initialize the matrix
-      matrix.resize(n + 1);
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i].resize(m + 1);
-        for (int j = 0; j < m + 1; ++j) {
-          matrix[i][j] = 0;
-        }
-      }
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i][0] = i;
-      }
-      for (int j = 0; j < m + 1; ++j) {
-        matrix[0][j] = j;
-      }
-
-      // calculate the insertions, substitutions and deletions
-      for (int i = 1; i < n + 1; ++i) {
-        int s_i = gtStr[i - 1];
-        for (int j = 1; j < m + 1; ++j) {
-          int t_j = recogStr[j - 1];
-          int cost = (s_i == t_j) ? 0 : 1;
-          const int above = matrix[i - 1][j];
-          const int left = matrix[i][j - 1];
-          const int diag = matrix[i - 1][j - 1];
-          const int cell = std::min(above + 1, std::min(left + 1, diag + cost));
-          matrix[i][j] = cell;
-        }
-      }
-
-      if (backtrace) {
-        size_t i = n;
-        size_t j = m;
-        substitutions = 0;
-        deletions = 0;
-        insertions = 0;
-
-        while (i != 0 && j != 0) {
-          if (matrix[i][j] == matrix[i - 1][j - 1]) {
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) {
-            ++substitutions;
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j] + 1) {
-            ++deletions;
-            --i;
-          } else {
-            ++insertions;
-            --j;
-          }
-        }
-        while (i != 0) {
-          ++deletions;
-          --i;
-        }
-        while (j != 0) {
-          ++insertions;
-          --j;
-        }
-        int diff = substitutions + deletions + insertions;
-        if (diff != matrix[n][m]) {
-          LOG(ERROR) << "Found path with distance " << diff
-                     << " but Levenshtein distance is " << matrix[n][m];
-        }
-
-        distance = (sp * substitutions) + (dp * deletions) + (ip * insertions);
-      } else {
-        distance = (real)matrix[n][m];
-      }
-    }
-    real maxLen = std::max(m, n);
-    deletions_ += deletions / maxLen;
-    insertions_ += insertions / maxLen;
-    substitutions_ += substitutions / maxLen;
-
-    if (distance != 0) {
-      seqClassficationError_ += 1;
-    }
-
-    return distance / maxLen;
-  }
-
-  real editDistance(
-      real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
-    numTimes_ = numTimes;
-    numClasses_ = numClasses;
-    blank_ = numClasses_ - 1;
-    outActivations_ = Matrix::create(output, numTimes, numClasses);
-    std::vector<int> recogStr, gtStr;
-    recogStr = bestLabelSeq();
-    for (int i = 0; i < labelsLen; ++i) {
-      gtStr.push_back(labels[i]);
-    }
-
-    return stringAlignment(gtStr, recogStr);
-  }
-
-  void storeLocalValues() const {
-    evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
-    evalResults_["deletion_error"] =
-        numSequences_ ? deletions_ / numSequences_ : 0;
-    evalResults_["insertion_error"] =
-        numSequences_ ? insertions_ / numSequences_ : 0;
-    evalResults_["substitution_error"] =
-        numSequences_ ? substitutions_ / numSequences_ : 0;
-    evalResults_["sequence_error"] =
-        (real)seqClassficationError_ / numSequences_;
-  }
-
- public:
-  CTCErrorEvaluator()
-      : numTimes_(0),
-        numClasses_(0),
-        numSequences_(0),
-        blank_(0),
-        deletions_(0),
-        insertions_(0),
-        substitutions_(0),
-        seqClassficationError_(0) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
-    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    CHECK(label.sequenceStartPositions);
-    CHECK(label.ids);
-    size_t numSequences = label.sequenceStartPositions->getSize() - 1;
-    const int* labelStarts = label.sequenceStartPositions->getData(false);
-    const int* outputStarts = output.sequenceStartPositions->getData(false);
-    real totalErr = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      real err = 0;
-      err = editDistance(
-          output.value->getData() + output.value->getWidth() * outputStarts[i],
-          outputStarts[i + 1] - outputStarts[i],
-          output.value->getWidth(),
-          label.ids->getData() + labelStarts[i],
-          labelStarts[i + 1] - labelStarts[i]);
-
-      totalErr += err;
-    }
-
-    return totalErr;
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    Evaluator::eval(nn);
-    std::vector<Argument> arguments;
-    arguments.reserve(config_.input_layers_size());
-    for (const std::string& name : config_.input_layers()) {
-      arguments.push_back(nn.getLayer(name)->getOutput());
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSequences_ += arguments[1].getNumSequences();
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numSequences_ = 0;
-    blank_ = 0;
-    deletions_ = 0;
-    insertions_ = 0;
-    substitutions_ = 0;
-    seqClassficationError_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << " error = " << evalResults_["error"];
-    os << " deletions error = " << evalResults_["deletion_error"];
-    os << " insertions error = " << evalResults_["insertion_error"];
-    os << " substitution error = " << evalResults_["substitution_error"];
-    os << " sequence error = " << evalResults_["sequence_error"];
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    double buf[6] = {totalScore_,
-                     (double)deletions_,
-                     (double)insertions_,
-                     (double)substitutions_,
-                     (double)seqClassficationError_,
-                     (double)numSequences_};
-    client->reduce(buf, buf, 6, FLAGS_trainer_id, 0);
-    totalScore_ = buf[0];
-    deletions_ = (real)buf[1];
-    insertions_ = (real)buf[2];
-    substitutions_ = (real)buf[3];
-    seqClassficationError_ = (int)buf[4];
-    numSequences_ = (int)buf[5];
-  }
-
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + evalResults_.size());
-    for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = evalResults_.find(buffers[buffers.size() - 1]);
-
-    if (it == evalResults_.end()) {
-      *err = Error("Evaluator does not have the key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "ctc_edit_distance";
-  }
-};
-
-REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
deleted file mode 100644
index 0ff3f2fa8..000000000
--- a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-#include "Evaluator.h"
-
-namespace paddle {
-
-/**
- * Chunk evaluator is used to evaluate segment labelling accuracy for a
- * sequence. It calculates the chunk detection F1 score.
- *
- * A chunk is correctly detected if its beginning, end and type are correct.
- * Other chunk type is ignored.
- * For each label in the label sequence, we have
- *
- * @code
- * tagType = label % numTagType
- * chunkType = label / numTagType
- * otherChunkType = numChunkTypes
- * @endcode
- *
- * The total number of different labels is numTagType*numChunkTypes+1
- * We support 4 labelling scheme
- * The tag type for each of the scheme is shown as follows:
- *
- * @code
- *  Scheme Begin Inside End   Single
- *   plain  0     -      -     -
- *   IOB    0     1      -     -
- *   IOE    -     0      1     -
- *   IOBES  0     1      2     3
- * @endcode
- *
- * 'plain' means the whole chunk must contain exactly the same chunk label.
- */
-class ChunkEvaluator : public Evaluator {
-  int otherChunkType_;
-  int numChunkTypes_;  // number of chunk types besides other chunk type
-  int numTagTypes_;
-  int tagBegin_;
-  int tagInside_;
-  int tagEnd_;
-  int tagSingle_;
-
-  int64_t numLabelSegments_;
-  int64_t numOutputSegments_;
-  int64_t numCorrect_;
-
-  struct Segment {
-    int begin;
-    int end;
-    int type;
-    bool operator==(const Segment& y) const {
-      return begin == y.begin && end == y.end && type == y.type;
-    }
-  };
-
-  std::vector<Segment> labelSegments_;
-  std::vector<Segment> outputSegments_;
-  std::set<int> excludedChunkTypes_;
-  mutable std::unordered_map<std::string, real> values_;
-
- public:
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (config.chunk_scheme() == "IOB") {
-      numTagTypes_ = 2;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOE") {
-      numTagTypes_ = 2;
-      tagBegin_ = -1;
-      tagInside_ = 0;
-      tagEnd_ = 1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOBES") {
-      numTagTypes_ = 4;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = 2;
-      tagSingle_ = 3;
-    } else if (config.chunk_scheme() == "plain") {
-      numTagTypes_ = 1;
-      tagBegin_ = -1;
-      tagInside_ = -1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else {
-      LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme();
-    }
-    CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
-    otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
-
-    // the chunks of types in excludedChunkTypes_ will not be counted
-    auto& tmp = config.excluded_chunk_types();
-    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numLabelSegments_ = 0;
-    numOutputSegments_ = 0;
-    numCorrect_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << "=" << values_["F1-score"]
-       << " true_chunks=" << numLabelSegments_
-       << " result_chunks=" << numOutputSegments_
-       << " correct_chunks=" << numCorrect_;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_};
-    client->reduce(buf, buf, 3, FLAGS_trainer_id, 0);
-    numLabelSegments_ = buf[0];
-    numOutputSegments_ = buf[1];
-    numCorrect_ = buf[2];
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    IVectorPtr& output = arguments[0].ids;
-    IVectorPtr& label = arguments[1].ids;
-    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
-    auto sequenceStartPositions =
-        arguments[1].sequenceStartPositions->getVector(false);
-    CHECK_EQ(output->getSize(), label->getSize());
-    CHECK(sequenceStartPositions);
-    size_t numSequences = sequenceStartPositions->getSize() - 1;
-    const int* starts = sequenceStartPositions->getData();
-    for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i],
-            label->getData() + starts[i],
-            starts[i + 1] - starts[i]);
-    }
-    return 0;
-  }
-
-  void eval1(int* output, int* label, int length) {
-    getSegments(output, length, outputSegments_);
-    getSegments(label, length, labelSegments_);
-    size_t i = 0, j = 0;
-    while (i < outputSegments_.size() && j < labelSegments_.size()) {
-      if (outputSegments_[i] == labelSegments_[j] &&
-          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
-        ++numCorrect_;
-      }
-      if (outputSegments_[i].end < labelSegments_[j].end) {
-        ++i;
-      } else if (outputSegments_[i].end > labelSegments_[j].end) {
-        ++j;
-      } else {
-        ++i;
-        ++j;
-      }
-    }
-    for (auto& segment : labelSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
-    }
-    for (auto& segment : outputSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
-    }
-  }
-
-  void getSegments(int* label, int length, std::vector<Segment>& segments) {
-    segments.clear();
-    segments.reserve(length);
-    int chunkStart = 0;
-    bool inChunk = false;
-    int tag = -1;
-    int type = otherChunkType_;
-    for (int i = 0; i < length; ++i) {
-      int prevTag = tag;
-      int prevType = type;
-      CHECK_LE(label[i], numChunkTypes_ * numTagTypes_);
-      tag = label[i] % numTagTypes_;
-      type = label[i] / numTagTypes_;
-      if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) {
-        Segment segment{
-            chunkStart,  // begin
-            i - 1,       // end
-            prevType,
-        };
-        segments.push_back(segment);
-        inChunk = false;
-      }
-      if (isChunkBegin(prevTag, prevType, tag, type)) {
-        chunkStart = i;
-        inChunk = true;
-      }
-    }
-    if (inChunk) {
-      Segment segment{
-          chunkStart,  // begin
-          length - 1,  // end
-          type,
-      };
-      segments.push_back(segment);
-    }
-  }
-
-  // whether (prevTag, prevType) is the end of a chunk
-  bool isChunkEnd(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return false;
-    if (type == otherChunkType_) return true;
-    if (type != prevType) return true;
-    if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagEnd_) return true;
-    if (prevTag == tagSingle_) return true;
-    return false;
-  }
-
-  // whether (tag, type) is the beginning of a chunk
-  bool isChunkBegin(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return type != otherChunkType_;
-    if (type == otherChunkType_) return false;
-    if (type != prevType) return true;
-    if (tag == tagBegin_) return true;
-    if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagSingle_) return true;
-    return false;
-  }
-
-  // three metrics: precision, recall and F1-score
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + values_.size());
-    for (auto it = values_.begin(); it != values_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  // get value by field name
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = values_.find(buffers.back());
-    if (it == values_.end()) {  // not found
-      *err = Error("No such key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  // get type of evaluator
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "chunk";
-  }
-
- private:
-  void storeLocalValues() const {
-    CHECK_GE(numOutputSegments_, 0);
-    CHECK_GE(numLabelSegments_, 0);
-    double precision =
-        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
-    double recall =
-        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
-    values_["precision"] = precision;
-    values_["recall"] = recall;
-    values_["F1-score"] =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-  }
-};
-
-REGISTER_EVALUATOR(chunk, ChunkEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
deleted file mode 100644
index 57657241f..000000000
--- a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/legacy/gserver/layers/DetectionUtil.h"
-
-using std::map;
-using std::vector;
-using std::pair;
-using std::make_pair;
-
-namespace paddle {
-
-/**
- * @brief detection map Evaluator
- *
- * The config file api is detection_map_evaluator.
- */
-class DetectionMAPEvaluator : public Evaluator {
- public:
-  DetectionMAPEvaluator()
-      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    allTruePos_.clear();
-    allFalsePos_.clear();
-    numPos_.clear();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    overlapThreshold_ = config_.overlap_threshold();
-    backgroundId_ = config_.background_id();
-    evaluateDifficult_ = config_.evaluate_difficult();
-    apType_ = config_.ap_type();
-
-    MatrixPtr detectTmpValue = arguments[0].value;
-    Matrix::resizeOrCreate(cpuOutput_,
-                           detectTmpValue->getHeight(),
-                           detectTmpValue->getWidth(),
-                           false,
-                           false);
-
-    MatrixPtr labelTmpValue = arguments[1].value;
-    Matrix::resizeOrCreate(cpuLabel_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    cpuOutput_->copyFrom(*detectTmpValue);
-    cpuLabel_->copyFrom(*labelTmpValue);
-
-    Argument label = arguments[1];
-    const int* labelIndex = label.sequenceStartPositions->getData(false);
-    size_t batchSize = label.getNumSequences();
-
-    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
-    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      map<size_t, vector<NormalizedBBox>> bboxes;
-      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
-        vector<NormalizedBBox> bbox;
-        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
-        int c = cpuLabel_->getData()[i * 6];
-        bboxes[c].push_back(bbox[0]);
-      }
-      allGTBBoxes.push_back(bboxes);
-    }
-
-    size_t n = 0;
-    const real* cpuOutputData = cpuOutput_->getData();
-    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
-      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
-      size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      while (curImgId == imgId && n < cpuOutput_->getHeight()) {
-        vector<real> label;
-        vector<real> score;
-        vector<NormalizedBBox> bbox;
-        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
-        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
-        ++n;
-        curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      }
-      allDetectBBoxes.push_back(bboxes);
-    }
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (map<size_t, vector<NormalizedBBox>>::iterator it =
-               allGTBBoxes[n].begin();
-           it != allGTBBoxes[n].end();
-           ++it) {
-        size_t count = 0;
-        if (evaluateDifficult_) {
-          count = it->second.size();
-        } else {
-          for (size_t i = 0; i < it->second.size(); ++i)
-            if (!(it->second[i].isDifficult)) ++count;
-        }
-        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
-          numPos_[it->first] = count;
-        } else {
-          numPos_[it->first] += count;
-        }
-      }
-    }
-
-    // calcTFPos
-    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
-
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    real mAP = calcMAP();
-    os << "Detection mAP=" << mAP;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Distribute detection evaluation not implemented.";
-  }
-
- protected:
-  void calcTFPos(const size_t batchSize,
-                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
-                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
-                     allDetectBBoxes) {
-    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
-      if (allGTBBoxes[n].size() == 0) {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          for (size_t i = 0; i < it->second.size(); ++i) {
-            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
-            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
-          }
-        }
-      } else {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
-          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
-            }
-          } else {
-            vector<NormalizedBBox> gtBBoxes =
-                allGTBBoxes[n].find(label)->second;
-            vector<bool> visited(gtBBoxes.size(), false);
-            // Sort detections in descend order based on scores
-            std::sort(predBBoxes.begin(),
-                      predBBoxes.end(),
-                      sortScorePairDescend<NormalizedBBox>);
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              real maxOverlap = -1.0;
-              size_t maxIdx = 0;
-              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
-                real overlap =
-                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
-                if (overlap > maxOverlap) {
-                  maxOverlap = overlap;
-                  maxIdx = j;
-                }
-              }
-              if (maxOverlap > overlapThreshold_) {
-                if (evaluateDifficult_ ||
-                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
-                  if (!visited[maxIdx]) {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    visited[maxIdx] = true;
-                  } else {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                  }
-                }
-              } else {
-                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-                allFalsePos_[label].push_back(
-                    make_pair(predBBoxes[i].first, 1));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  real calcMAP() const {
-    real mAP = 0.0;
-    size_t count = 0;
-    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
-         it != numPos_.end();
-         ++it) {
-      size_t label = it->first;
-      size_t labelNumPos = it->second;
-      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
-        continue;
-      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
-      vector<pair<real, size_t>> labelFalsePos =
-          allFalsePos_.find(label)->second;
-      // Compute average precision.
-      vector<size_t> tpCumSum;
-      getAccumulation(labelTruePos, &tpCumSum);
-      vector<size_t> fpCumSum;
-      getAccumulation(labelFalsePos, &fpCumSum);
-      std::vector<real> precision, recall;
-      size_t num = tpCumSum.size();
-      // Compute Precision.
-      for (size_t i = 0; i < num; ++i) {
-        CHECK_LE(tpCumSum[i], labelNumPos);
-        precision.push_back(static_cast<real>(tpCumSum[i]) /
-                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
-        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
-      }
-      // VOC2007 style
-      if (apType_ == "11point") {
-        vector<real> maxPrecisions(11, 0.0);
-        int startIdx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = startIdx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              startIdx = i;
-              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
-              break;
-            } else {
-              if (maxPrecisions[j] < precision[i])
-                maxPrecisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
-        ++count;
-      } else if (apType_ == "Integral") {
-        // Nature integral
-        real averagePrecisions = 0.;
-        real prevRecall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prevRecall) > 1e-6)
-            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
-          prevRecall = recall[i];
-        }
-        mAP += averagePrecisions;
-        ++count;
-      } else {
-        LOG(FATAL) << "Unkown ap version: " << apType_;
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP * 100;
-  }
-
-  void getAccumulation(vector<pair<real, size_t>> inPairs,
-                       vector<size_t>* accuVec) const {
-    std::stable_sort(
-        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
-    accuVec->clear();
-    size_t sum = 0;
-    for (size_t i = 0; i < inPairs.size(); ++i) {
-      sum += inPairs[i].second;
-      accuVec->push_back(sum);
-    }
-  }
-
-  std::string getTypeImpl() const { return "detection_map"; }
-
-  real getValueImpl() const { return calcMAP(); }
-
- private:
-  real overlapThreshold_;  // overlap threshold when determining whether matched
-  bool evaluateDifficult_;  // whether evaluate difficult ground truth
-  size_t backgroundId_;     // class index of background
-  std::string apType_;      // how to calculate mAP (Integral or 11point)
-
-  MatrixPtr cpuOutput_;
-  MatrixPtr cpuLabel_;
-
-  map<size_t, size_t> numPos_;  // counts of true objects each classification
-  map<size_t, vector<pair<real, size_t>>>
-      allTruePos_;  // true positive prediction
-  map<size_t, vector<pair<real, size_t>>>
-      allFalsePos_;  // false positive prediction
-};
-
-REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.cpp b/paddle/legacy/gserver/evaluators/Evaluator.cpp
deleted file mode 100644
index a956f40d0..000000000
--- a/paddle/legacy/gserver/evaluators/Evaluator.cpp
+++ /dev/null
@@ -1,1361 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-void Evaluator::eval(const NeuralNetwork& nn) {
-  std::vector<Argument> arguments;
-  arguments.reserve(config_.input_layers_size());
-  for (const std::string& name : config_.input_layers()) {
-    arguments.push_back(nn.getLayer(name)->getOutput());
-  }
-  SetDevice device(arguments[0].deviceId);
-  real score = evalImp(arguments);
-  totalScore_ += score;
-  updateSamplesNum(arguments);
-}
-/**
- * @brief classification error Evaluator
- *
- * The config file api is classification_error_evaluator.
- */
-class ClassificationErrorEvaluator : public Evaluator {
- public:
-  /*
-  ClassificationErrorEvaluator() : totalScore2_(0) {}
-
-  virtual void start() {
-    Evaluator::start();
-    totalScore2_ = 0;
-    } */
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (3 == arguments.size()) {
-      numSamples_ += arguments[2].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  MatrixPtr calcError(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), (size_t)2);
-    CHECK_LE(arguments.size(), (size_t)3);
-    MatrixPtr& output = arguments[0].value;
-    IVectorPtr& label = arguments[1].ids;
-    MatrixPtr& multiBinaryLabel = arguments[1].value;  // For multi binary label
-    bool supportWeight = (3 == arguments.size()) ? true : false;
-    MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-    if (nullptr == output ||
-        (nullptr == label && nullptr == multiBinaryLabel) ||
-        (supportWeight && nullptr == weight)) {
-      return 0;
-    }
-
-    if (label != nullptr) {
-      CHECK_EQ(label->getSize(), output->getHeight());
-    } else {
-      CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight());
-      CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth());
-    }
-    if (supportWeight) {
-      CHECK_EQ(output->getHeight(), weight->getHeight());
-      CHECK_EQ((size_t)1, weight->getWidth());
-    }
-
-    const MatrixPtr errorMat = Matrix::create(output->getHeight(),
-                                              1,
-                                              /* trans= */ false,
-                                              useGpu(arguments[0].deviceId));
-
-    errorMat->zeroMem();
-
-    if (label != nullptr) {
-      errorMat->classificationError(*output, *label, config_.top_k());
-    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
-               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
-      errorMat->classificationErrorMulti(
-          *output, *multiBinaryLabel, config_.classification_threshold());
-    } else {
-      errorMat->binaryClassificationError(
-          0, *output, *multiBinaryLabel, config_.classification_threshold());
-    }
-
-    if (supportWeight) {
-      errorMat->dotMul(*errorMat, *weight);
-    }
-    return errorMat;
-  }
-
-  void printStats(std::ostream& os) const {
-    if (config_.top_k() == 1) {
-      os << config_.name() << "="
-         << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    } else {
-      os << " top_" << config_.top_k()
-         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-    return errorMat->getSum();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "classification_error"; }
-};
-
-/**
- * @brief sequence classification error Evaluator
- * @note sequence level classification error stats,
- * if any frame in one sequence has error, the sequence is error
- */
-class SequenceClassificationErrorEvaluator
-    : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getNumSequences();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    auto sequenceStartPositions =
-        arguments[0].sequenceStartPositions->getVector(false);
-    CHECK(sequenceStartPositions != nullptr);
-    const int* starts = sequenceStartPositions->getData();
-
-    MatrixPtr errorMat = calcError(arguments);
-
-    int errCounter = 0;
-    CpuVector errorVec(0, nullptr);
-    for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
-      errorVec.subVecFrom(
-          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
-      if (errorVec.getSum() > 0) {
-        errCounter += 1;
-      }
-    }
-
-    return static_cast<real>(errCounter);
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "seq_classification_error"; }
-};
-REGISTER_EVALUATOR(seq_classification_error,
-                   SequenceClassificationErrorEvaluator);
-/**
- * @brief sum Evaluator
- * Calculate the sum of output or label
- *
- * The config file api is sum_evaluator.
- */
-class SumEvaluator : public Evaluator {
- public:
-  SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("SumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (supportWeight) {
-      if (nullptr == arguments[1].value) {
-        return 0;
-      }
-      CHECK_EQ(arguments[1].value->getWidth(), (size_t)1);
-    }
-
-    // The sum of output
-    if (arguments[0].value) {
-      if (supportWeight) {
-        CHECK_EQ(arguments[0].value->getHeight(),
-                 arguments[1].value->getHeight());
-        MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(),
-                                          arguments[0].value->getWidth(),
-                                          /* trans= */ false,
-                                          arguments[0].value->useGpu());
-        tmpMat->copyFrom(*arguments[0].value);
-        tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        return tmpMat->getSum();
-      } else {
-        return arguments[0].value->getSum();
-      }
-      // The sum of label
-    } else if (arguments[0].ids) {
-      size_t insNum = arguments[0].ids->getSize();
-      IVectorPtr label = arguments[0].ids;
-      MatrixPtr weight = supportWeight ? arguments[1].value : nullptr;
-      if (dynamic_cast<GpuIVector*>(label.get())) {
-        IVector::resizeOrCreate(cpuLabel_, insNum, false);
-        cpuLabel_->copyFrom(*arguments[0].ids);
-
-        if (supportWeight) {
-          CHECK_EQ(insNum, arguments[1].value->getHeight());
-          Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-          cpuWeight_->copyFrom(*arguments[1].value);
-        }
-
-        label = cpuLabel_;
-        weight = cpuWeight_;
-      }
-
-      if (supportWeight) {
-        real score = 0.0;
-        int* labelD = label->getData();
-        real* weightD = weight->getData();
-        for (size_t i = 0; i < insNum; ++i) {
-          score += (labelD[i] * weightD[i]);
-        }
-        return score;
-      } else {
-        return label->getSum();
-      }
-    } else {
-      return 0;
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
- private:
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const { return "sum"; }
-};
-/**
- * @brief column sum Evaluator
- * @note column sum for the colIdx-th column *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is column_sum_evaluator.
- *
- */
-class ColumnSumEvaluator : public Evaluator {
- public:
-  explicit ColumnSumEvaluator(int32_t colIdx)
-      : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    if (nullptr != sum_) {
-      sum_->zeroMem();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("ColumnSumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (nullptr == arguments[0].value ||
-        (supportWeight && nullptr == arguments[1].value)) {
-      return 0;
-    }
-
-    size_t insNum = arguments[0].value->getHeight();
-    size_t colNum = arguments[0].value->getWidth();
-    if (nullptr == sum_) {
-      sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false);
-      colNum_ = colNum;
-      sum_->zeroMem();
-    } else {
-      CHECK_EQ(colNum, sum_->getWidth());
-    }
-
-    if (supportWeight) {
-      CHECK_EQ(insNum, arguments[1].value->getHeight());
-      CHECK_EQ((size_t)1, arguments[1].value->getWidth());
-      MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-      if (arguments[0].value->useGpu()) {
-        tmpMat->copyFrom(*arguments[0].value);
-      }
-      if (!arguments[1].value->useGpu()) {
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        }
-      } else {
-        MatrixPtr tmp2 = Matrix::create(insNum, 1);
-        tmp2->copyFrom(*arguments[1].value);
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *tmp2);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *tmp2);
-        }
-      }
-      sum_->accumulateColSum(*tmpMat);
-    } else {
-      if (!arguments[0].value->useGpu()) {
-        sum_->accumulateColSum(*arguments[0].value);
-      } else {
-        MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-        tmpMat->copyFrom(*arguments[0].value);
-        sum_->accumulateColSum(*tmpMat);
-      }
-    }
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
-        << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", "
-        << colNum_ << ")";
-    size_t colIdx = 0;
-    if (colIdx_ >= 0) {
-      colIdx = colIdx_;
-    } else {
-      colIdx = colNum_ + colIdx_;
-    }
-    os << config_.name() << "="
-       << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0);
-  }
-
-  void distributeEval(ParameterClient2* client) {
-    client->reduce(
-        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
-    client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
-  }
-
- private:
-  int32_t colIdx_;
-  size_t colNum_;
-  MatrixPtr sum_; /* cpu matrix */
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const {
-    if (colIdx_ == -1)
-      return "last-column-sum";
-    else
-      return "column-sum";
-  }
-};
-
-void AucEvaluator::start() {
-  Evaluator::start();
-  memset(statPos_, 0, sizeof(statPos_));
-  memset(statNeg_, 0, sizeof(statNeg_));
-}
-
-real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("AucEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr labelval = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-
-  if (nullptr == output || (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  // Copy label from value to a vector.
-  if (nullptr == label && nullptr != labelval) {
-    // label width is 1
-    CHECK_EQ(1U, labelval->getWidth());
-    VectorPtr vec =
-        Vector::create(labelval->getData(), insNum, output->useGpu());
-    label = vec->castToInt();
-  }
-
-  CHECK_EQ(insNum, label->getSize());
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
-      << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", "
-      << outputDim << ")";
-  realColumnIdx_ = 0;
-  if (colIdx_ >= 0) {
-    realColumnIdx_ = colIdx_;
-  } else {
-    realColumnIdx_ = outputDim + colIdx_;
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           insNum,
-                           outputDim,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, insNum, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = supportWeight ? weight->getData() : nullptr;
-  size_t pos = realColumnIdx_;
-
-  for (size_t i = 0; i < insNum; ++i) {
-    real value = outputD[pos];
-    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
-    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
-                              << "] out of range, predict value[" << value
-                              << "]";
-    real w = supportWeight ? weightD[i] : 1.0;
-    if (labelD[i] == kNegativeLabel_) {
-      statNeg_[binIdx] += w;
-    } else {
-      statPos_[binIdx] += w;
-    }
-    pos += outputDim;
-  }
-  return 0;
-}
-
-void AucEvaluator::distributeEval(ParameterClient2* client) {
-  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-}
-
-double AucEvaluator::calcAuc() const {
-  double totPos = 0.0;
-  double totNeg = 0.0;
-  double totPosPrev = 0.0;
-  double totNegPrev = 0.0;
-  double auc = 0.0;
-
-  int64_t idx = kBinNum_;
-  while (idx >= 0) {
-    totPosPrev = totPos;
-    totNegPrev = totNeg;
-    totPos += statPos_[idx];
-    totNeg += statNeg_[idx];
-    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-    --idx;
-  }
-
-  if (totPos > 0.0 && totNeg > 0.0) {
-    return auc / totPos / totNeg;
-  } else {
-    return 0.0;
-  }
-}
-
-real AucEvaluator::getValueImpl() const { return calcAuc(); }
-
-std::string AucEvaluator::getTypeImpl() const {
-  if (colIdx_ == -1) {
-    return "last-column-auc";
-  } else {
-    return "auc";
-  }
-}
-
-// class RankAucEvaluator
-REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
-
-void RankAucEvaluator::start() { Evaluator::start(); }
-void RankAucEvaluator::updateSamplesNum(
-    const std::vector<Argument>& arguments) {
-  numSamples_ += arguments[0].getNumSequences();
-}
-real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 2U);
-  CHECK_LE(arguments.size(), 3U);
-  double batchAuc = 0.0;
-  output_ = arguments[0].value;
-  click_ = arguments[1].value;
-  size_t batchSize = output_->getHeight();
-  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
-
-  if (arguments.size() == 3U) {
-    pv_ = arguments[2].value;
-  } else {
-    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
-    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
-  }
-
-  real* outputData = output_->getData();
-  real* clickData = click_->getData();
-  real* pvData = pv_->getData();
-
-  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
-  const int* startPosData = startPos->getData();
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    batchAuc += calcRankAuc(outputData + beginPos,
-                            clickData + beginPos,
-                            pvData + beginPos,
-                            endPos - beginPos);
-  }
-  return batchAuc;
-}
-
-double RankAucEvaluator::calcRankAuc(real* outputData,
-                                     real* clickData,
-                                     real* pvData,
-                                     size_t size) {
-  outputPair_.clear();
-  for (size_t i = 0; i < size; ++i) {
-    outputPair_.push_back(std::make_pair(outputData[i], i));
-  }
-  std::sort(outputPair_.begin(),
-            outputPair_.end(),
-            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-              return a.first > b.first;
-            });
-  double aucTmp = 0.0;
-  double clickSum = 0.0;
-  double oldClickSum = 0.0;
-  double noClick = 0.0;
-  double noClickSum = 0.0;
-
-  double lastScore = outputPair_[0].first + 1.0;
-  for (size_t i = 0; i < size; ++i) {
-    if (lastScore != outputPair_[i].first) {
-      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-      oldClickSum = clickSum;
-      noClick = 0.0;
-      lastScore = outputPair_[i].first;
-    }
-    size_t id = outputPair_[i].second;
-    noClick += pvData[id] - clickData[id];
-    noClickSum += noClick;
-    clickSum += clickData[id];
-  }
-  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-  return (clickSum * noClickSum) == 0.0 ? 0.0
-                                        : aucTmp / (clickSum * noClickSum);
-}
-
-std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
-
-// class PrecisionRecallEvaluator
-REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
-
-void PrecisionRecallEvaluator::start() {
-  Evaluator::start();
-  statsInfo_.clear();
-  values_.clear();
-}
-
-real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("PrecisionRecallEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr multiBinaryLabel = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  if (label != nullptr) {
-    CHECK_EQ(insNum, label->getSize());
-  } else {
-    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
-    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
-  }
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (statsInfo_.size() != outputDim) {
-    statsInfo_.clear();
-    statsInfo_.resize(outputDim);
-  }
-
-  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
-  if (label != nullptr) {
-    if (dynamic_cast<GpuMatrix*>(output.get())) {
-      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
-      cpuOutput_->copyFrom(*output);
-      IVector::resizeOrCreate(cpuLabel_, insNum, false);
-      cpuLabel_->copyFrom(*label);
-      if (supportWeight) {
-        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-        cpuWeight_->copyFrom(*weight);
-      }
-
-      output = cpuOutput_;
-      label = cpuLabel_;
-      weight = cpuWeight_;
-    }
-    calcStatsInfo(output, label, weight);
-  } else {
-    // Not support GPU for multi binary labels
-    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
-    calcStatsInfoMulti(output, multiBinaryLabel, weight);
-  }
-  return 0;
-}
-
-void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  PrintStatsInfo info;
-  bool containMacroMicroInfo = getStatsInfo(&info);
-  os << "positive_label=" << config_.positive_label()
-     << " precision=" << info.precision << " recall=" << info.recall
-     << " F1-score=" << info.f1;
-  if (containMacroMicroInfo) {
-    os << "macro-average-precision=" << info.macroAvgPrecision
-       << " macro-average-recall=" << info.macroAvgRecall
-       << " macro-average-F1-score=" << info.macroAvgF1Score;
-    if (!isMultiBinaryLabel_) {
-      // precision and recall are equal in this case
-      os << " micro-average-precision=" << info.microAvgPrecision;
-    } else {
-      os << " micro-average-precision=" << info.microAvgPrecision
-         << " micro-average-recall=" << info.microAvgRecall
-         << " micro-average-F1-score=" << info.microAvgF1Score;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
-                                             const IVectorPtr& label,
-                                             const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  for (size_t i = 0; i < insNum; ++i) {
-    CHECK_GE(labelD[i], 0);
-    CHECK_LT((size_t)labelD[i], dim);
-    size_t maxIdx = 0;
-    real maxValue = outputD[i * dim];
-    for (size_t j = 1; j < dim; ++j) {
-      size_t idx = i * dim + j;
-      if (maxValue < outputD[idx]) {
-        maxIdx = j;
-        maxValue = outputD[idx];
-      }
-    }
-
-    real w = (weightD != nullptr) ? weightD[i] : 1.0;
-    if (maxIdx == (size_t)labelD[i]) {
-      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
-      // true negative for all labels except for labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-    } else {
-      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
-      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
-      // true negatives for all labels except for maxIdx and labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-      statsInfo_[labelD[i]].TN -= w;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
-                                                  const MatrixPtr& label,
-                                                  const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  real threshold = config_.classification_threshold();
-  for (size_t i = 0; i < insNum; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + j;
-      if (outputD[idx] < threshold) {
-        statsInfo_[j].TN += w;  // true negative
-      } else {
-        statsInfo_[j].FP += w;  // false positive
-      }
-    }
-
-    const int* cols = labelD->getRowCols(i);
-    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + cols[j];
-      if (outputD[idx] < threshold) {
-        statsInfo_[cols[j]].FN += w;  // false negative
-        statsInfo_[cols[j]].TN -= w;  // true negative
-      } else {
-        statsInfo_[cols[j]].TP += w;  // true positive
-        statsInfo_[cols[j]].FP -= w;  // false positive
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::storeLocalValues() const {
-  if (this->values_.size() == 0) {
-    PrintStatsInfo info;
-    bool containMacroMicroInfo = getStatsInfo(&info);
-    values_["precision"] = info.precision;
-    values_["recal"] = info.recall;
-    values_["F1-score"] = info.f1;
-    if (containMacroMicroInfo) {
-      values_["macro-average-precision"] = info.macroAvgPrecision;
-      values_["macro-average-recall"] = info.macroAvgRecall;
-      values_["macro-average-F1-score"] = info.macroAvgF1Score;
-      if (!isMultiBinaryLabel_) {
-        // precision and recall are equal in this case
-        values_["micro-average-precision"] = info.microAvgPrecision;
-      } else {
-        values_["micro-average-precision"] = info.microAvgPrecision;
-        values_["micro-average-recall"] = info.microAvgRecall;
-        values_["micro-average-F1-score"] = info.microAvgF1Score;
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
-  this->storeLocalValues();
-  names->reserve(this->values_.size());
-  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
-    names->push_back(this->config_.name() + "." + it->first);
-  }
-}
-
-real PrecisionRecallEvaluator::getValue(const std::string& name,
-                                        Error* err) const {
-  this->storeLocalValues();
-  std::vector<std::string> buffers;
-  paddle::str::split(name, '.', &buffers);
-  auto it = this->values_.find(buffers[buffers.size() - 1]);
-  if (it == this->values_.end()) {  // not found
-    *err = Error("No such key %s", name.c_str());
-    return .0f;
-  }
-
-  return it->second;
-}
-
-std::string PrecisionRecallEvaluator::getType(const std::string& name,
-                                              Error* err) const {
-  this->getValue(name, err);
-  if (!err->isOK()) {
-    return "";
-  }
-  return "precision_recall";
-}
-
-void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
-  size_t size = 4 * statsInfo_.size();
-  double* buf = new double[size];
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    buf[4 * i + 0] = statsInfo_[i].TP;
-    buf[4 * i + 1] = statsInfo_[i].TN;
-    buf[4 * i + 2] = statsInfo_[i].FP;
-    buf[4 * i + 3] = statsInfo_[i].FN;
-  }
-  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    statsInfo_[i].TP = buf[4 * i + 0];
-    statsInfo_[i].TN = buf[4 * i + 1];
-    statsInfo_[i].FP = buf[4 * i + 2];
-    statsInfo_[i].FN = buf[4 * i + 3];
-  }
-  delete[] buf;
-}
-
-bool PrecisionRecallEvaluator::getStatsInfo(
-    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    info->f1 = calcF1Score(info->precision, info->recall);
-    return false;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  info->macroAvgPrecision = 0;
-  info->macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    info->macroAvgPrecision +=
-        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  info->macroAvgPrecision /= numLabels;
-  info->macroAvgRecall /= numLabels;
-  info->macroAvgF1Score =
-      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
-
-  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  info->microAvgF1Score =
-      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
-  return true;
-}
-
-REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
-void PnpairEvaluator::start() {
-  Evaluator::start();
-  memset(pairArray_, 0, sizeof(pairArray_));
-  predictArray_.clear();
-}
-
-real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 3UL);
-  CHECK_LE(arguments.size(), 4UL);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  IVectorPtr info = arguments[2].ids;
-  bool supportWeight = (4 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
-  if (nullptr == output || nullptr == label ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t height = output->getHeight();
-  size_t width = output->getWidth();
-  CHECK_EQ(height, label->getSize());
-  CHECK_EQ(height, info->getSize());
-  if (supportWeight) {
-    CHECK_EQ(height, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    IVector::resizeOrCreate(cpuInfo_, height, false);
-    cpuOutput_->copyFrom(*output);
-    cpuLabel_->copyFrom(*label);
-    cpuInfo_->copyFrom(*info);
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    info = cpuInfo_;
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-      weight = cpuWeight_;
-    }
-  }
-
-  real* outputs = output->getData();
-  int* labels = label->getData();
-  int* infos = info->getData();
-  real* weights = supportWeight ? weight->getData() : nullptr;
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    real y1 = outputs[i * width + (width - 1)];
-    real w = supportWeight ? weights[i] : 1.0;
-    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
-  }
-  return 0;
-}
-
-void PnpairEvaluator::stat(size_t start,
-                           size_t end,
-                           PredictionResult* answers,
-                           double& pos,
-                           double& neg,
-                           double& spe) {
-  for (size_t i = start; i < end; i++) {
-    for (size_t j = i + 1; j < end; j++) {
-      CHECK_EQ(answers[i].queryid, answers[j].queryid);
-      // The pair weight is the mean of the two samples' weight
-      double weight = (answers[i].weight + answers[j].weight) / 2.0;
-      if (answers[i].label != answers[j].label) {
-        if ((answers[i].out > answers[j].out &&
-             answers[i].label > answers[j].label) ||
-            (answers[i].out < answers[j].out &&
-             answers[i].label < answers[j].label)) {
-          pos += weight;
-        } else if ((answers[i].out > answers[j].out &&
-                    answers[i].label < answers[j].label) ||
-                   (answers[i].out < answers[j].out &&
-                    answers[i].label > answers[j].label)) {
-          neg += weight;
-        } else {
-          spe += weight;
-        }
-      }
-    }
-  }
-}
-
-void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
-  std::sort(predictArray.begin(),
-            predictArray.end(),
-            [](const PredictionResult& x, const PredictionResult& y) {
-              return x.queryid < y.queryid;
-            });
-
-  double pos = 0;
-  double neg = 0;
-  double special = 0;
-  auto start = predictArray.begin();
-  while (start != predictArray.end()) {
-    auto end = std::find_if(
-        start + 1, predictArray.end(), [=](const PredictionResult& x) {
-          return x.queryid != start->queryid;
-        });
-    CHECK(end != start);
-    stat(start - predictArray.begin(),
-         end - predictArray.begin(),
-         predictArray.data(),
-         pos,
-         neg,
-         special);
-
-    start = end;
-  }
-
-  pairArray_[0] += pos;
-  pairArray_[1] += neg;
-
-  LOG(INFO) << " calc total pos pair: " << pos
-            << " calc total neg pair: " << neg
-            << " calc total special pair: " << special;
-}
-
-std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
-
-ClassRegistrar<Evaluator> Evaluator::registrar_;
-Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = registrar_.createByType(config.type());
-  evaluator->init(config);
-  return evaluator;
-}
-
-REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
-REGISTER_EVALUATOR(sum, SumEvaluator);
-static InitFunction __reg_type_auc_sum__([]() {
-  Evaluator::registrar_.registerClass(
-      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
-  Evaluator::registrar_.registerClass("last-column-auc",
-                                      [] { return new AucEvaluator(-1); });
-});
-
-/**
- * @brief print value of each layer.
- *
- * The config file api is value_printer_evaluator.
- */
-class ValuePrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
-                                                      "layer=" + name + " ");
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(value_printer, ValuePrinter);
-
-/**
- * @brief print gradient of each layer.
- *
- * The config file api is gradient_printer_evaluator.
- */
-class GradientPrinter : public NotGetableEvaluator {
- public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.grad) {
-        std::ostringstream os;
-        argu.grad->print(os);
-        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
-/**
- * @brief print row max id vctor of each layer
- *
- * The config file api is maxid_printer_evaluator.
- */
-class MaxIdPrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-
- public:
-  MaxIdPrinter() {}
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        size_t height = argu.value->getHeight();
-        size_t width = config_.num_results();
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-        argu.value->rowMax(*maxIds_, *maxValues_);
-        std::ostringstream os;
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t i = 0; i < height; ++i) {
-          for (size_t j = 0; j < width; ++j) {
-            size_t pos = i * width + j;
-            os << ids[pos] << " : " << values[pos] << ", ";
-          }
-          os << std::endl;
-        }
-        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
-/**
- * @brief print sequence max frames of each layer
- *
- * The config file api is maxframe_printer_evaluator.
- */
-class MaxFramePrinter : public NotGetableEvaluator {
- private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-  MatrixPtr value_;
-
- public:
-  MaxFramePrinter() {
-    value_ =
-        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-
-      CHECK_EQ(argu.value->getWidth(), 1LU);
-      size_t numSequences = argu.getNumSequences();
-      const int* starts = argu.sequenceStartPositions->getData(false);
-
-      std::ostringstream os;
-      for (size_t i = 0; i < numSequences; ++i) {
-        size_t offset = starts[i];
-        size_t size = starts[i + 1] - starts[i];
-        value_->setData(argu.value->getData() + offset, 1LU, size);
-
-        size_t height = 1LU;
-        size_t width = std::min((size_t)config_.num_results(), size);
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-
-        value_->rowMax(*maxIds_, *maxValues_);
-
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t j = 0; j < width; ++j) {
-          os << ids[j] << " : " << values[j] << ", ";
-        }
-        os << "total " << size << " frames" << std::endl;
-      }
-      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
-
-/**
- * @brief print text according to index matrix and a dictionary.
- *
- * There can be multiple input to this layer:
- * - If there is only one input, the input must be a matrix containing
- *      the sequence of indices;
- * - If there are more than one input, the first input should be ids,
- *      and are interpreted as sample ids.
- *
- * The output format will be:
- *
- * - sequence without sub-sequence, and there is probability.
- *
- *     @code
- *      id \t prob space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence without sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence with sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      ...
- *     @endcode
- *
- * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
- * with maxid (when generating) as an input.
- *
- * The config file api is seqtext_printer_evaluator.
- *
- */
-class SequenceTextPrinter : public NotGetableEvaluator {
- private:
-  /// dict_file, which contains a list of tokens
-  std::vector<std::string> dict_;
-  /// result_file, which is the output file
-  std::ofstream os_;
-  /// True/False, to indicate whether to use space to separate output tokens.
-  /// Default is True. No space is added if set to False.
-  bool delimited_;
-  /// store the cpu version of argument.ids
-  std::vector<IVectorPtr> cpuIds_;
-  /// store the probability associated with each sequence
-  std::vector<MatrixPtr> cpuIn_;
-
- public:
-  SequenceTextPrinter() {}
-
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (!config.dict_file().empty()) {
-      loadFileList(config.dict_file(), dict_);
-    }
-
-    os_.open(config.result_file(), std::ofstream::trunc);
-    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
-    delimited_ = config.delimited();
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), 1LU);
-    bool hasId = arguments.size() > 1;
-    size_t numSequences = arguments[0].getNumSequences();
-    if (hasId) {
-      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
-          << "first input must be sample id.";
-    }
-    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
-      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
-    }
-
-    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
-      if (src && src->useGpu()) {
-        IVector::resizeOrCreate(dest, src->getSize(), false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
-      if (src && src->useGpu()) {
-        Matrix::resizeOrCreate(
-            dest, src->getHeight(), src->getWidth(), false, false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    cpuIds_.resize(arguments.size());
-    cpuIn_.resize(arguments.size());
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      resizeVector(cpuIds_[i], arguments[i].ids);
-      resizeMatrix(cpuIn_[i], arguments[i].in);
-    }
-
-    int* sampleIds = nullptr;
-    if (hasId) {
-      sampleIds = cpuIds_[0]->getData();
-    }
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      os_ << (hasId ? sampleIds[i] : i);
-      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
-        int* output = cpuIds_[j]->getData();
-        const int* starts = arguments[j].sequenceStartPositions->getData(false);
-
-        auto seqPrint = [&](int start, int end) {
-          os_ << "\t";
-          for (int k = start; k < end; k++) {
-            int id = output[k];
-            os_ << (delimited_ ? " " : "");
-            if (!dict_.empty()) {
-              CHECK_LT((size_t)id, dict_.size());
-              os_ << dict_[id];
-            } else {
-              os_ << id;
-            }
-          }
-        };
-
-        if (arguments[j].hasSubseq()) {
-          // print sequence with sub-sequence
-          const int* subStarts =
-              arguments[j].subSequenceStartPositions->getData(false);
-          int subSeqId_start = 0;
-          int subSeqId_end = 0;
-          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
-               ++k) {
-            if (starts[i] == subStarts[k]) subSeqId_start = k;
-            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
-          }
-          for (int k = subSeqId_start; k < subSeqId_end; k++) {
-            seqPrint(subStarts[k], subStarts[k + 1]);
-            os_ << std::endl;
-          }
-
-        } else {
-          // print sequence without sub-sequence
-          if (arguments[j].in) {  // beam print
-            real* probs = cpuIn_[j]->rowBuf(i);
-            os_ << std::endl;
-            int start = starts[i];
-            int seqEnd = starts[i + 1];
-            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
-              if (start == seqEnd) {
-                break;
-              }
-              int end = start + output[start] + 2;
-              CHECK_LE(end, seqEnd);
-              CHECK_EQ(output[end - 1], -1);
-              os_ << k << "\t" << probs[k];
-              seqPrint(start + 1, end - 1);
-              os_ << std::endl;
-              start = end;
-            }
-          } else {
-            seqPrint(starts[i], starts[i + 1]);
-          }
-        }
-      }
-      os_ << std::endl;
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
-/**
- * @brief print classification error.
- *
- * The config file api is classification_error_printer_evaluator.
- */
-class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
- public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-
-    std::ostringstream os;
-    errorMat->print(os);
-    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
-              << os.str();
-
-    if (auto startPos = arguments[0].sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
-                << os.str();
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
-
-std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.h b/paddle/legacy/gserver/evaluators/Evaluator.h
deleted file mode 100644
index b3462819b..000000000
--- a/paddle/legacy/gserver/evaluators/Evaluator.h
+++ /dev/null
@@ -1,510 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Error.h"
-
-namespace paddle {
-
-class NeuralNetwork;
-/**
- * @def REGISTER_EVALUATOR
- * @brief Macro for registering evaluator class
- */
-
-#define REGISTER_EVALUATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                \
-    Evaluator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-/**
- * @brief Base class for Evaluator
- * Evaluating the performance of a model is very important.
- * It indicates how successful the scores(predictions) of a datasets
- * has been by a trained model.
- */
-class Evaluator {
- public:
-  static Evaluator* create(const EvaluatorConfig& config);
-
-  Evaluator() : numSamples_(0), totalScore_(0) {}
-
-  virtual ~Evaluator() {}
-
-  virtual void init(const EvaluatorConfig& config) { config_ = config; }
-
-  /**
-   * @brief start to evaluate some data
-   */
-  virtual void start() {
-    numSamples_ = 0;
-    totalScore_ = 0;
-  }
-
-  /**
-   * @brief Process a batch of data.
-   */
-  virtual void eval(const NeuralNetwork& nn);
-
-  /**
-   * @brief Process a batch of data.
-   * @return the score for the batch if it make sense to sum the score across
-   * batches.
-   * @note Otherwise evaluator should return 0 and override finish() and
-   * printStats() to do the right calculation.
-   */
-  virtual real evalImp(std::vector<Argument>& arguments) = 0;
-
-  /**
-   * @brief Update the number of processed samples
-   */
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getBatchSize();
-  }
-
-  /// finish() should be called before distributeEval
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  void mergeResultsOfAllClients(ParameterClient2* client) {
-    double data[2] = {totalScore_, numSamples_};
-    client->reduce(data, data, 2, FLAGS_trainer_id, 0);
-    totalScore_ = data[0];
-    numSamples_ = data[1];
-  }
-
-  /**
-   * @brief finish the evaluation.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief print the statistics of evaluate result
-   * @note finish() should be called before printStats
-   */
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "="
-       << (numSamples_ ? totalScore_ / numSamples_ : 0);
-  }
-
-  friend std::ostream& operator<<(std::ostream& os,
-                                  const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return os;
-  }
-
-  friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
-                                   const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return std::move(os);
-  }
-
-  static ClassRegistrar<Evaluator> registrar_;
-
-  /**
-   * @brief getNames will return all field names of current evaluator.
-   *
-   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
-   * has multiple field, the name could be `evaluator_name.field1`. For example
-   * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get
-   * names will return `precision_recall_evaluator.precision`,
-   * `precision_recall_evaluator.recal`, etc.
-   *
-   * Also, if current Evaluator is a combined evaluator. getNames will return
-   * all names of all evaluators inside the combined evaluator.
-   *
-   * @param names [out]: the field names of current evaluator.
-   * @note Never clear the names parameter inside getNames.
-   */
-  virtual void getNames(std::vector<std::string>* names) {
-    names->push_back(config_.name());
-  }
-
-  /**
-   * @brief getValue will return the current evaluate value of one field.
-   *
-   * @param name: The field name of current evaluator.
-   * @param err [out]: The error state.
-   *
-   * @return The evaluate value(metric).
-   */
-  virtual real getValue(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return .0f;
-    }
-    return this->getValueImpl();
-  }
-
-  /**
-   * @brief getType will return the evaluator type by field name.
-   *
-   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
-   * 'precision_recall'. In combined evaluator, different name may get different
-   * evaluate type because it could be evaluated by different evaluator inside.
-   *
-   * @param name: The field name of current Evaluator.
-   * @param err: The error state. nullptr means don't care.
-   * @return the evaluator type string.
-   */
-  virtual std::string getType(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return std::string();
-    }
-    return this->getTypeImpl();
-  }
-
- protected:
-  /**
-   * @brief getValueImpl The simplest way to define getValue result. If this
-   * evaluator doesn't contain multiple fields, and do not throw any error, just
-   * implemented this method to get the evaluate result(metric).
-   * @return Evaluate result(metric).
-   */
-  virtual real getValueImpl() const {
-    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
-  }
-
-  /**
-   * @brief getTypeImpl The simplest way to define getType result. If this
-   * evaluator doesn't combine many evaluators, the get type should only return
-   * itself type.
-   * @return Evaluator type.
-   */
-  virtual std::string getTypeImpl() const { return "base"; }
-
- protected:
-  EvaluatorConfig config_;
-  double numSamples_;
-  double totalScore_;
-};
-
-/**
- * @brief The NotGetableEvaluator class is the base class of evaluator that
- * cannot get value in runtime. The most NotGetableEvaluator is Printer
- * Evaluator, which is only used to debug network configuration.
- */
-class NotGetableEvaluator : public Evaluator {
-  // Evaluator interface
- public:
-  void getNames(std::vector<std::string>* names) {}
-
-  real getValue(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return .0f;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return "";
-  }
-};
-
-class DummyEvaluator : public Evaluator {
- public:
-  DummyEvaluator() {}
-  virtual void init(const EvaluatorConfig&) {}
-  virtual void start() {}
-  virtual void eval(const NeuralNetwork&) {}
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-  virtual void finish() {}
-  virtual void printStats(std::ostream&) const {}
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const;
-};
-/**
- * @brief evaluate AUC using colIdx-th column as prediction.
- * The AUC(Area Under the Curve) is a common evaluation metric
- * for binary classification problems. It computes the area under
- * the receiver operating characteristic(ROC) curve.
- *
- * @note colIdx-th column
- *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is auc_evaluator.
- *
- */
-class AucEvaluator : public Evaluator {
- public:
-  AucEvaluator(int32_t colIdx)
-      : colIdx_(colIdx),
-        realColumnIdx_(0),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "=" << calcAuc();
-  }
-
-  virtual void distributeEval(ParameterClient2* client);
-
- private:
-  static const uint32_t kBinNum_ = (1 << 24) - 1;
-  static const int kNegativeLabel_ = 0;
-  double statPos_[kBinNum_ + 1];
-  double statNeg_[kBinNum_ + 1];
-  int32_t colIdx_;
-  uint32_t realColumnIdx_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  AucEvaluator() {}
-
-  inline static double trapezoidArea(double X1,
-                                     double X2,
-                                     double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  double calcAuc() const;
-
-  // Evaluator interface
- protected:
-  real getValueImpl() const;
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles
- * under the same query), and averages them. Each list should be organized
- * as a sequence. The inputs of this evaluator is [output, click, pv]. If pv
- * is not provided, it will be set to 1. The types of click and pv are
- * dense value.
- */
-class RankAucEvaluator : public Evaluator {
- public:
-  // evaluate ranking AUC
-  virtual void start();
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments);
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
- private:
-  MatrixPtr output_;
-  MatrixPtr click_;
-  MatrixPtr pv_;
-  std::vector<std::pair<real, int>> outputPair_;
-
-  double calcRankAuc(real* outputData,
-                     real* clickData,
-                     real* pvData,
-                     size_t size);
-
-  // Evaluator interface
- protected:
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief precision, recall and f1 score Evaluator
- * \f[
- * precision = \frac{tp}{tp+tn} \\
- * recall=\frac{tp}{tp+fn} \\
- * f1=2*\frac{precsion*recall}{precision+recall}
- * \f]
- *
- * The config file api is precision_recall_evaluator.
- */
-class PrecisionRecallEvaluator : public Evaluator {
- public:
-  // Evaluate precision, recall and F1 score
-  PrecisionRecallEvaluator()
-      : isMultiBinaryLabel_(false),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const;
-
-  virtual void distributeEval(ParameterClient2* client);
-
-  void getNames(std::vector<std::string>* names);
-
-  real getValue(const std::string& name, Error* err) const;
-
-  std::string getType(const std::string& name, Error* err) const;
-
-  struct StatsInfo {
-    /// numbers of true positives
-    double TP;
-    /// numbers of true negatives
-    double TN;
-    /// numbers of false positives
-    double FP;
-    /// numbers of false negatives
-    double FN;
-
-    StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
-  };
-
- private:
-  bool isMultiBinaryLabel_;
-  std::vector<StatsInfo> statsInfo_;
-
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  struct PrintStatsInfo {
-    double precision;
-    double recall;
-    double f1;
-    double macroAvgPrecision;
-    double macroAvgRecall;
-    double macroAvgF1Score;
-    double microAvgPrecision;
-    double microAvgRecall;
-    double microAvgF1Score;
-  };
-
-  bool getStatsInfo(PrintStatsInfo* info) const;
-
-  void calcStatsInfo(const MatrixPtr& output,
-                     const IVectorPtr& label,
-                     const MatrixPtr& weight);
-
-  void calcStatsInfoMulti(const MatrixPtr& output,
-                          const MatrixPtr& label,
-                          const MatrixPtr& weight);
-
-  inline static double calcPrecision(double TP, double FP) {
-    if (TP > 0.0 || FP > 0.0) {
-      return TP / (TP + FP);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcRecall(double TP, double FN) {
-    if (TP > 0.0 || FN > 0.0) {
-      return TP / (TP + FN);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcF1Score(double precision, double recall) {
-    if (precision > 0.0 || recall > 0.0) {
-      return 2 * precision * recall / (precision + recall);
-    } else {
-      return 0;
-    }
-  }
-
-  mutable std::unordered_map<std::string, real> values_;
-
-  void storeLocalValues() const;
-};
-
-/*
- * @brief positive-negative pair rate Evaluator
- *
- * The config file api is pnpair_evaluator.
- */
-class PnpairEvaluator : public Evaluator {
- public:
-  PnpairEvaluator()
-      : cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuInfo_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label, int __queryid, real __weight)
-        : out(__out), label(__label), queryid(__queryid), weight(__weight) {}
-    real out;
-    int label;
-    int queryid;
-    real weight;
-  };
-  std::vector<PredictionResult> predictArray_;
-  void printPredictResults() {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << " " << res.queryid << std::endl;
-    }
-  }
-
-  void stat(size_t start,
-            size_t end,
-            PredictionResult* answers,
-            double& pos,
-            double& neg,
-            double& spe);
-  void calc(std::vector<PredictionResult>& predictArray);
-
-  virtual void finish() { calc(predictArray_); }
-
-  virtual void printStats(std::ostream& os) const {
-    os << " pos/neg=" << this->getValueImpl();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0);
-    LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0]
-              << " calc total neg pair: " << pairArray_[1];
-  }
-
- private:
-  static const uint32_t kPairArrayNum_ = 2;
-  double pairArray_[kPairArrayNum_];
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  IVectorPtr cpuInfo_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
- protected:
-  real getValueImpl() const {
-    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
-  }
-  std::string getTypeImpl() const;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
deleted file mode 100644
index 1c4034d8b..000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GradientMachine.h"
-
-#include <fstream>
-#include "paddle/legacy/utils/Logging.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "GradientMachineMode.h"
-#include "MultiGradientMachine.h"
-#include "MultiNetwork.h"
-#include "ParallelNeuralNetwork.h"
-#endif
-
-namespace paddle {
-
-GradientMachine* GradientMachine::create(
-    const ModelConfig& config,
-    int mode,
-    const std::vector<ParameterType>& parameterTypes) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
-    return gm;
-  }
-  if (FLAGS_trainer_count > 1) {
-    return new MultiGradientMachine(config, FLAGS_use_gpu);
-  }
-#endif
-  if (FLAGS_trainer_count == 1) {  // single
-#ifndef PADDLE_MOBILE_INFERENCE
-    NeuralNetwork* nn;
-    if (config.type() == "multi_nn") {
-      /* multi submodel calculate, thread(s) will be initialized inside */
-      nn = new MultiNetwork("root");
-    } else if (FLAGS_parallel_nn) {
-      /* multi threads calculate */
-      nn = new ParallelNeuralNetwork();
-    } else {
-      /* single thread calculate */
-      nn = NeuralNetwork::create(config);
-    }
-#else
-    NeuralNetwork* nn = NeuralNetwork::create(config);
-#endif
-    ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
-      para->enableType(PARAMETER_VALUE);
-    };
-    nn->init(
-        config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes);
-    return nn;
-  }
-  LOG(FATAL) << "Unknown model type: " << config.type();
-  return nullptr;
-}
-
-void GradientMachine::saveParameters(const std::string& dir) const {
-  LOG(INFO) << "Saving parameters to " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->save(filename);
-    }
-  }
-}
-
-void GradientMachine::loadParameters(const std::string& dir) {
-  LOG(INFO) << "Loading parameters from " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->load(filename);
-    }
-  }
-}
-
-void GradientMachine::randParameters() {
-  LOG(INFO) << "Initing parameters..";
-
-  for (auto& para : parameters_) {
-    if (para->isFullSize()) {
-      para->randomize();
-    }
-  }
-  LOG(INFO) << "Init parameters done.";
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.h b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
deleted file mode 100644
index d4f754a9f..000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachine.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-
-#include "ModelConfig.pb.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-#include "paddle/legacy/utils/Thread.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-#endif
-
-namespace paddle {
-/**
- * @brief A gradient machine is capable of calculating some outputs given
- *        some inputs and performing gradient calculation based on the
- *        derivative from the outputs.
- *
- * A gradient machine can be either a full neural network or part of a neural
- * network.
- *
- * Usage for training:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Calculate gradient with respect to outArgs[i]->value
- *     and fill them into outArgs[i]->grad.
- *     This step can be skipped if your the outputs are from cost layers.
- *
- *  4. Call backward(). After backward, gradient of each parameter is
- *     accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT)
- *
- *  5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using
- *     gradients.
- *
- *  6. Clear gradients to zero.
- *
- * Usage for prediction:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Obtain the prediction result from outArgs[i]
- */
-
-typedef std::vector<LayerStatePtr> MachineState;
-
-class GradientMachine;
-
-typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
-
-class GradientMachine {
- public:
-  enum CreateMode {
-    kNormal = 0,
-    kSgdSparseCpuTraining = 3,
-    kTesting = 4,
-    kCustom = 10
-  };
-
-  /**
-   * Create a gradient machine from ModelConfig
-   * Parameter will have parameterTypes
-   */
-  static GradientMachine* create(
-      const ModelConfig& config,
-      int mode = kNormal,
-      const std::vector<ParameterType>& parameterTypes =
-          std::vector<ParameterType>{
-              PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
-
-  virtual ~GradientMachine() {}
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; }
-
-  /**
-   * @brief Forward propagation.
-   *
-   * Calculate outputs (outArgs) based the inputs (inArgs)
-   *
-   * @note: if passType==PASS_TEST, then backward() should not be called
-   */
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType) = 0;
-
-  /**
-   * @brief Backward propagation.
-   *
-   * Calculate the gradient of inArgs and parameter.
-   *
-   * This function should only be called after a corresponding forward() call.
-   * The caller is responsible for filling the correct grad for the outArgs
-   * obtained using forward().
-   *
-   * It may also change the grad field for the inArgs supplied at forward()
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * Combine forward() and backward(). For multithread training, this
-   * may be faster.
-   *
-   * @note: passType PASS_TEST is not allowed for forwardBackward().
-   */
-  virtual void forwardBackward(const std::vector<Argument>& inArgs,
-                               std::vector<Argument>* outArgs,
-                               PassType passType,
-                               const UpdateCallback& callback = nullptr) {
-    forward(inArgs, outArgs, passType);
-    backward(callback);
-  }
-
-  virtual Argument getLayerOutput(const std::string& layerName) = 0;
-
-  // see comment in Layer.h for the function with the same name
-  virtual void resetState() {}
-
-  // set machine state
-  virtual void setState(const MachineState& machineState) {}
-
-  // save machine state
-  virtual void getState(MachineState& machineState) {}
-
-  virtual void onPassEnd() = 0;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  /**
-   * Create an evaluator which can be used for eval()
-   */
-  virtual Evaluator* makeEvaluator() const = 0;
-
-  /**
-   * evaluate using the given evaluator
-   */
-  virtual void eval(Evaluator* evaluator) const = 0;
-#endif
-
-  std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  std::vector<ParameterPtr>& getNonStaticParameters() {
-    if (nonStaticParameters_.empty()) {
-      for (auto para : parameters_) {
-        if (!para->isStatic()) {
-          nonStaticParameters_.push_back(para);
-        }
-      }
-    }
-    return nonStaticParameters_;
-  }
-
-  inline bool hasStaticParameters() {
-    return parameters_.size() != getNonStaticParameters().size();
-  }
-
-  /**
-   * @brief   Used before formal training, start work-threads and set
-   *          trainer Parameters;
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void start() {}
-
-  /**
-   * @brief   check  each work-thread whether is failed/error/finish,
-   *          if not, return ture, and yes return false.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief   set the training status a "finished" value, the sub_work_threads
-   *          will option the change, and then exit.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual bool trainIsOn() { return true; }
-
-  /**
-   * @brief   when all or some of the sub-workThreads are suspended to waiting
-   *          controller's instructions, and after some processing done in the
-   *          controller, it will call this function to wake up all the pending
-   *          thread.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void restart() {}
-
-  /// Set the gradient of the output from outside.
-  virtual void setOutputGrad(const std::vector<Argument>& args) {
-    LOG(FATAL) << "Not implemented!";
-  }
-
-  void saveParameters(const std::string& dir) const;
-
-  void loadParameters(const std::string& dir);
-
-  void randParameters();
-
-  virtual void getStats(real& cost, int64_t& numProcessed) {
-    (void)cost;
-    (void)numProcessed;
-  }
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  virtual void releaseOutput() {}
-
- protected:
-  virtual void onLoadParameter() {}
-
-  std::vector<ParameterPtr> parameters_;
-  std::vector<ParameterPtr> nonStaticParameters_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
deleted file mode 100644
index 9a0b2643e..000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GradientMachineMode.h"
-
-namespace paddle {
-std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
-    IGradientMachineMode::modes_;
-}
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
deleted file mode 100644
index dd944a35f..000000000
--- a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "GradientMachine.h"
-#include "unordered_map"
-
-namespace paddle {
-
-class IGradientMachineMode {
- public:
-  virtual ~IGradientMachineMode() {}
-
- public:  // interfaces
-          /**
-           * @brief create current mode's gradient machine by model config.
-           * @param config model config
-           */
-  virtual GradientMachine* create(const ModelConfig& config) = 0;
-
-  /**
-   * @brief shouldBeMe the current mode of GradientMachine should be this mode.
-   * @param algo training algorithm name.
-   * @param trainerCount trainer count.
-   * @param isLocal is local mode (without pserver)
-   * @param isGpu is using gpu.
-   * @return true if mode should be this mode.
-   */
-  virtual bool shouldBeMe(const std::string& algo,
-                          size_t trainerCount,
-                          bool isLocal,
-                          bool isGpu) const = 0;
-
-  /**
-   * @brief Is data must be in cpu even if using gpu mode.
-   * @param trainerCount trainer count
-   * @return true if data must be gpu.
-   */
-  virtual bool isDataMustInCpu(size_t trainerCount) const = 0;
-
-  /**
-   * @brief Need not to use mini-batch method, and should train all data in one
-   * batch in one pass.
-   */
-  virtual bool needTrainWholeDataInOneBatch() const = 0;
-
- public:  // static methods.
-          /**
-           * @brief register a custom gradient machine mode.
-           * @note For user to register a custom gradient machine mode, id should >=
-           * kCustom.
-           * @param mode mode id.
-           * @param ptr mode description object.
-           */
-  static void regGradientMachineMode(
-      int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
-    modes_.insert(std::make_pair(mode, std::move(ptr)));
-  }
-
-  /**
-   * @brief get custom mode from mode id.
-   * @param mode mode id
-   * @return mode description object.
-   */
-  static IGradientMachineMode* mode(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode].get();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief helper function to test trainWholeDataInOneBatch or not for mode
-   */
-  static bool trainWholeDataInOneBatch(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->needTrainWholeDataInOneBatch();
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * @brief Try to get custom mode if we can.
-   * @param [out] mode the custom mode id.
-   * @param [in] algo algorithm name
-   * @param [in] trainerCount trainer count.
-   * @param [in] isLocal is local or not
-   * @param [in] isGpu using gpu or not.
-   * @return true if there is a custom mode fit these conditions.
-   */
-  static bool tryGetMode(int* mode,
-                         const std::string& algo,
-                         int32_t trainerCount,
-                         bool isLocal,
-                         bool isGpu) {
-    for (auto it = modes_.begin(); it != modes_.end(); ++it) {
-      if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) {
-        *mode = it->first;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /**
-   * @brief helper function for data must in cpu
-   */
-  static bool dataMustInCpu(int32_t mode, size_t trainerCount) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->isDataMustInCpu(trainerCount);
-    } else {
-      // provide data to cpu if using synchronized multi-gpu gradient machine.
-      return trainerCount > 1;
-    }
-  }
-
-  /**
-   * @brief try to create gradient machine by mode & config.
-   * @return nullptr if we cannot create a gradient machine by such mode.
-   */
-  static GradientMachine* tryCreateGradientMachine(int32_t mode,
-                                                   const ModelConfig& config) {
-    auto m = IGradientMachineMode::mode(mode);
-    if (m) {
-      return m->create(config);
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
-      modes_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
deleted file mode 100644
index 3ef0dfbfe..000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
+++ /dev/null
@@ -1,898 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiGradientMachine.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-DEFINE_bool(allow_only_one_model_on_one_gpu,
-            true,
-            "If true, do not allow multiple models on one GPU device");
-
-namespace paddle {
-
-// get types of the parameters which need to be merged after backward()
-static void fillMergeTypes(PassType passType,
-                           std::vector<ParameterType>* mergeTypes) {
-  mergeTypes->clear();
-  if (passType != PASS_TEST) {
-    mergeTypes->push_back(PARAMETER_GRADIENT);
-  }
-}
-
-MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
-                                           bool useGpu)
-    : useGpu_(useGpu),
-      trainerBarrier_(FLAGS_trainer_count),
-      allBarrier_(FLAGS_trainer_count + 1),
-      inArgsCopied_(false) {
-  isPassGrad_ = false;
-  numThreads_ = FLAGS_trainer_count;
-  if (useGpu) {
-    //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
-    //! the hl_get_device_count will get an error result. It seems should return
-    //! 0 when hppl is not compiled as gpu version.
-    numDevices_ = hl_get_device_count();
-  } else {
-    numDevices_ = 0;
-  }
-  ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) {
-    // only create buf for CPU parameters
-    // GPU parameters will be created in each thread
-    if (para->useGpu()) return;
-
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-    } else if (para->isGradSparseUpdate()) {
-      para->enableType(PARAMETER_VALUE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS);
-      SparseRowIdsCpuMatrix* mat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      mat->setNumOfThreads(FLAGS_trainer_count);
-    } else if (para->isValueShared()) {
-      para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    } else {
-      para->enableType(PARAMETER_VALUE);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    }
-  };
-
-  NeuralNetwork* nn = NeuralNetwork::create(config);
-  nn->init(config, mainParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-
-  numLogicalDevices_ = 0;
-  if (useGpu_) {
-    numLogicalDevices_ = 1;
-
-    for (size_t pid = 0; pid < parameters_.size(); pid++) {
-      if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) {
-        numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1;
-      }
-    }
-    LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_
-              << " numThreads=" << numThreads_ << " numDevices=" << numDevices_;
-
-    if (numLogicalDevices_ * numThreads_ > numDevices_ &&
-        FLAGS_allow_only_one_model_on_one_gpu) {
-      LOG(FATAL) << "trainer_count * num_devices_in_model "
-                 << "(" << numThreads_ << "*" << numLogicalDevices_ << ")"
-                 << "=" << numThreads_ * numLogicalDevices_
-                 << " exceeds number of GPU devices(" << numDevices_ << ")";
-    }
-    numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_);
-
-    /* Enables direct access to memory allocations on a peer device */
-    for (int i = 0; i < numThreads_; i++) {
-      for (int d = 0; d < numLogicalDevices_; ++d) {
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i + 1));
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i - 1));
-      }
-    }
-  }
-
-  for (int i = 0; i < numThreads_; ++i) {
-    threads_.emplace_back(new TrainerThread(config, i, this));
-  }
-
-  bufferSizes_.resize(numLogicalDevices_, 0);
-  paraMainThread_.reserve(parameters_.size());
-  int pid = 0;
-  for (auto& para : parameters_) {
-    if (para->isStatic() || !para->useGpu()) {
-      paraMainThread_.push_back(0);
-    } else {
-      int end = pid++ % numThreads_;
-      paraMainThread_.push_back(end);
-      int paraDeviceId = para->getDeviceId();
-      if (paraDeviceId == -1) paraDeviceId = 0;
-      paraDeviceId = paraDeviceId % numLogicalDevices_;
-      if (para->getSize() > bufferSizes_[paraDeviceId]) {
-        bufferSizes_[paraDeviceId] = para->getSize();
-        VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize();
-      }
-    }
-  }
-
-  // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller
-  // fixed buffer size and use pipeline to dispatch parameter value and merge
-  // parameter gradient, which may be faster.
-
-  // combination of all trainers mainPara into GradientMachine parameters
-  hasNonstaticCpuParamters_ = false;
-  for (size_t pid = 0; pid < parameters_.size(); pid++) {
-    if (parameters_[pid]->useGpu()) {
-      parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid];
-    } else if (!parameters_[pid]->isStatic()) {
-      hasNonstaticCpuParamters_ = true;
-    }
-  }
-
-  gradBufs_.resize(numThreads_);
-  for (int i = 0; i < numThreads_; ++i) {
-    gradBufs_[i].resize(numLogicalDevices_);
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      gradBufs_[i][d].sem.post();
-    }
-  }
-
-  outArgStream_ = HPPL_STREAM_1;
-
-  start();
-}
-
-void MultiGradientMachine::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
-std::vector<const std::vector<ParameterPtr>*>
-MultiGradientMachine::getSlaveParameters() {
-  std::vector<const std::vector<ParameterPtr>*> vec;
-  vec.reserve(threads_.size());
-  for (auto& thread : threads_) {
-    vec.push_back(&thread->getParameters());
-  }
-  return vec;
-}
-
-void MultiGradientMachine::notifyGradientTransfer(int paramId) {
-  gradQueue_.enqueue(paramId);
-}
-
-void MultiGradientMachine::allocGradBufs() {
-  if (numLogicalDevices_ == 0) return;
-  if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return;
-
-  for (int i = 0; i < numThreads_; i++) {
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      if (bufferSizes_[d] == 0) continue;
-      SetDevice device(logicalDeviceId2RealDeviceId(d, i));
-      for (size_t j = 0; j < mergeTypes_.size(); j++) {
-        gradBufs_[i][d].bufs.push_back(
-            Vector::create(bufferSizes_[d], /* useGpu= */ true));
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  // Each gradient machine in threads needs to do prefetch on its own
-  // part of inArgs. So we need to first divide inArgs to each thread
-  inArgs_ = inArgs;
-  startTask(TASK_COPY_IN_ARGS);
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->clearIndices();
-    }
-  }
-
-  waitForCopyInArgs();
-
-  // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread
-  // at one time, we need to do prefetch sequentially
-  for (auto& thread : threads_) {
-    thread->prefetch();
-  }
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->setupIndices();
-      auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      matGrad->reserveStore();
-    }
-  }
-}
-
-void MultiGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType) {
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD);
-}
-
-void MultiGradientMachine::forwardImp(const std::vector<Argument>& inArgs,
-                                      std::vector<Argument>* outArgs,
-                                      PassType passType,
-                                      TaskType taskType) {
-  updateThreadParameters();
-  passType_ = passType;
-
-  if (!inArgsCopied_) {
-    inArgs_ = inArgs;
-    inArgsCopied_ = false;
-  }
-
-  fillMergeTypes(passType, &mergeTypes_);
-  allocGradBufs();
-  startTask(taskType);
-
-  getOutArgs(outArgs, passType);
-}
-
-void MultiGradientMachine::backward(const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  startTask(TASK_BACKWARD);
-  backwardImp(callback);
-}
-
-void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>* outArgs,
-                                           PassType passType,
-                                           const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD);
-  backwardImp(callback);
-}
-
-Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
-  std::vector<Argument> args;
-  args.reserve(threads_.size());
-
-  for (auto& thread : threads_) {
-    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
-  }
-  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
-
-  return outLayerArgs_;
-}
-
-void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
-  for (size_t i = 0; i < parameters_.size(); i++) {
-    if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
-    REGISTER_TIMER("controller_dequeue");
-    gradQueue_.dequeue();
-  }
-  if (hasNonstaticCpuParamters()) {
-    waitAfterMerge();
-    if (backwardCallback_) {
-      for (auto& para : parameters_) {
-        if (!para->useGpu() && !para->isStatic()) {
-          backwardCallback_(para.get());
-        }
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::updateThreadParameters() {
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    if (!parameters_[pid]->useGpu()) continue;
-    if (!parameters_[pid]->isValueUpdated()) continue;
-    parameters_[pid]->clearValueUpdated();
-    for (int i = 0; i < (int)threads_.size(); i++) {
-      threads_[i]->incUpdateCounter();
-    }
-    // NotifyValueReady should happen after that all threads' incUpdateCounter()
-    // are called so that the counters are correct when notifyValueReady()
-    // is called.
-    threads_[paraMainThread_[pid]]->notifyValueReady(pid);
-  }
-}
-
-void MultiGradientMachine::onPassEnd() {
-  for (auto& thread : threads_) {
-    thread->onPassEnd();
-  }
-}
-
-Evaluator* MultiGradientMachine::makeEvaluator() const {
-  return threads_[0]->getGradientMachine()->makeEvaluator();
-}
-
-void MultiGradientMachine::eval(Evaluator* evaluator) const {
-  for (auto& thread : threads_) {
-    SetDevice device(thread->getDeviceId());
-    if (thread->hasInputData()) {
-      thread->getGradientMachine()->eval(evaluator);
-    }
-  }
-}
-
-void MultiGradientMachine::getOutArgs(std::vector<Argument>* outArgs,
-                                      PassType passType) {
-  for (auto& thread : threads_) {
-    REGISTER_TIMER("waitOutArgs");
-    thread->waitOutArgsReady();
-  }
-
-  outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size());
-
-  REGISTER_TIMER("copyOutArgs");
-  for (size_t i = 0; i < outArgs_.size(); ++i) {
-    std::vector<Argument> args;
-    args.reserve(threads_.size());
-    for (auto& thread : threads_) {
-      // If the thread input is empty, then the output is empty.
-      auto tmp = thread->getOutArgs();
-      if (tmp.size() > 0) {
-        args.push_back(tmp[i]);
-      }
-    }
-    outArgs_[i].concat(args, useGpu_, outArgStream_, passType);
-  }
-
-  if (useGpu_) {
-    hl_stream_synchronize(outArgStream_);
-  }
-
-  *outArgs = outArgs_;
-}
-
-void MultiGradientMachine::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_EQ(args.size(), outArgs_.size());
-  for (size_t i = 0; i < args.size(); i++) {
-    outArgs_[i].grad = args[i].grad;
-  }
-}
-
-void MultiGradientMachine::startTask(TaskType taskType) {
-  taskType_ = taskType;
-  for (auto& thread : threads_) {
-    thread->notifyTaskReady();
-  }
-}
-
-TrainerThread::TrainerThread(const ModelConfig& config,
-                             int threadId,
-                             MultiGradientMachine* multiMachine)
-    : multiMachine_(multiMachine),
-      config_(config),
-      threadId_(threadId),
-      inArgsCopied_(false) {
-  int numThreads = multiMachine->getNumThreads();
-
-  auto& mainParas = multiMachine->getParameters();
-
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-
-  partnerId_ = mod(threadId_ - 1, numThreads);
-
-  deviceId_ = !multiMachine_->useGpu()
-                  ? -1
-                  : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_);
-  SetDevice gpuDevice(deviceId_);
-
-  NeuralNetwork* nn = nullptr;
-  if (!multiMachine->useGpu() || !FLAGS_parallel_nn) {
-    nn = NeuralNetwork::create(config);
-  } else {
-    nn = new ParallelNeuralNetwork();
-    for (auto& paraConfig : *config_.mutable_parameters()) {
-      if (paraConfig.device() != -1) {
-        paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            paraConfig.device(), threadId_));
-      }
-    }
-    for (auto& layerConfig : *config_.mutable_layers()) {
-      if (layerConfig.device() != -1) {
-        layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            layerConfig.device(), threadId_));
-      }
-    }
-  }
-  // Only GPU do not share parameter values with main paramters.
-  ParamInitCallback slaveParamInitCb =
-      std::bind(parameterInitNN, _1, _2, &mainParas);
-  nn->init(config_, slaveParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-  if (!FLAGS_parallel_nn) {
-    for (auto& para : parameters_) {
-      para->setDevice(deviceId_);
-    }
-  }
-
-  backwardCallback_ =
-      std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1);
-
-  gradStream_ = HPPL_STREAM_2;
-  valueStream_ = HPPL_STREAM_3;
-  stopping_ = true;
-  updateCounter_ = 0;
-  parameterUpdated_ = false;
-}
-
-TrainerThread::~TrainerThread() { stop(); }
-
-void TrainerThread::start() {
-  if (!stopping_) return;
-
-  stopping_ = false;
-
-  gradientMachine_->start();
-
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-
-  if (multiMachine_->useGpu()) {
-    gradCollectThread_.reset(
-        new std::thread([this]() { gradCollectThread(); }));
-
-    valueDispatchThread_.reset(
-        new std::thread([this]() { valueDispatchThread(); }));
-
-    copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); }));
-  }
-}
-
-void TrainerThread::stop() {
-  if (stopping_) return;
-
-  stopping_ = true;
-
-  if (computeThread_) {
-    taskReadySem_.post();
-    computeThread_->join();
-  }
-  if (gradCollectThread_) {
-    gradQueue_.enqueue(0);
-    gradCollectThread_->join();
-  }
-  if (copyThread_) {
-    gradBufQueue_.enqueue(0);
-    copyThread_->join();
-  }
-  if (valueDispatchThread_) {
-    valueReadyQueue_.enqueue(0);
-    valueDispatchThread_->join();
-  }
-}
-
-void TrainerThread::computeThread() {
-  VLOG(1) << "gradComputeThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    {
-      REGISTER_TIMER("taskSem_wait");
-      taskReadySem_.wait();
-    }
-
-    if (stopping_) break;
-
-    switch (multiMachine_->getTaskType()) {
-      case MultiGradientMachine::TASK_FORWARD_BACKWARD:
-        forward();
-        backward();
-        break;
-      case MultiGradientMachine::TASK_FORWARD:
-        forward();
-        break;
-      case MultiGradientMachine::TASK_BACKWARD:
-        backward();
-        break;
-      case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        batchSize_ = copyInArgs();
-        inArgsCopied_ = true;
-        multiMachine_->waitForCopyInArgs();
-        break;
-    }
-  }
-  hl_fini();
-}
-
-void TrainerThread::prefetch() {
-  SetDevice setDevice(deviceId_);
-  gradientMachine_->prefetch(inArgs_);
-}
-
-void TrainerThread::forward() {
-  if (!inArgsCopied_) {
-    REGISTER_TIMER("copyInArgs");
-    batchSize_ = copyInArgs();
-  } else {
-    inArgsCopied_ = false;
-  }
-
-  if (multiMachine_->getPassType() != PASS_TEST) {
-    REGISTER_TIMER("clearGradient");
-    // For main parameter, the user of MultiGpuSyncMachine is responsible
-    // for setting the gradient to zero
-    for (size_t i = 0; i < parameters_.size(); i++) {
-      if (parameters_[i]->useGpu()) {
-        if (multiMachine_->paraMainThread(i) != threadId_) {
-          SetDevice device(parameters_[i]->getDeviceId());
-          parameters_[i]->clearGradient();
-        }
-      } else {
-        parameters_[i]->clearGradient();
-      }
-    }
-  }
-
-  {
-    REGISTER_TIMER("wait_value");
-    valueReadyCond_.wait([this]() { return !parameterUpdated_; });
-  }
-
-  { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); }
-
-  {
-    REGISTER_TIMER("thread_forward");
-    if (batchSize_ > 0) {
-      gradientMachine_->forward(
-          inArgs_, &outArgs_, multiMachine_->getPassType());
-    } else {
-      outArgs_.clear();
-    }
-  }
-  outArgsReadySem_.post();
-}
-
-void TrainerThread::backward() {
-  REGISTER_TIMER("thread_backward");
-  if (multiMachine_->isPassGrad()) {
-    copyOutputGrad();
-  }
-  if (batchSize_ > 0) {
-    gradientMachine_->backward(backwardCallback_);
-  } else {
-    for (size_t i = parameters_.size(); i > 0; i--) {
-      backwardCallback(parameters_[i - 1].get());
-    }
-  }
-  if (multiMachine_->hasNonstaticCpuParamters()) {
-    mergeCpuGradients();
-  }
-}
-
-void TrainerThread::backwardCallback(Parameter* para) {
-  // CPU parameters are merged in the end
-  if (!para->useGpu() || para->isStatic()) return;
-
-  int paramId = para->getID();
-  if (multiMachine_->getNumThreads() == 1) {
-    // no need to do merge if there is only one thread
-    doCallback(paramId);
-  } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1,
-                              multiMachine_->getNumThreads())) {
-    notifyCopyGradToBuffer(paramId);
-  } else {
-    notifyGradientCollect(paramId);
-  }
-}
-
-void TrainerThread::copyGradToBufferThread() {
-  VLOG(1) << "copyGradToBufferThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-  auto& partnerThread = multiMachine_->getThread(partnerId_);
-  auto& gradBufs = multiMachine_->getGradBuf(partnerId_);
-
-  while (true) {
-    int pid = gradBufQueue_.dequeue();
-    if (stopping_) break;
-
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("waitBufferReady");
-      gradBuf.sem.wait();
-    }
-
-    {
-      REGISTER_TIMER("copyGradToBuffer");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        gradBuf.bufs[i]->resize(
-            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
-        gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]),
-                                  gradStream_);
-      }
-      hl_stream_synchronize(gradStream_);
-    }
-    partnerThread->notifyGradientCollect(pid);
-  }
-  hl_fini();
-}
-
-void TrainerThread::gradCollectThread() {
-  VLOG(1) << "gradCollectThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  std::vector<size_t> gradReadyCount(parameters_.size(), 0);
-
-  auto& gradBufs = multiMachine_->getGradBuf(threadId_);
-
-  while (true) {
-    int pid = gradQueue_.dequeue();
-    if (stopping_) break;
-
-    if (++gradReadyCount[pid] < 2) continue;
-    gradReadyCount[pid] = 0;
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("mergeGrad");
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        ParameterType type = mergeTypes_[i];
-        const VectorPtr& localGrad = parameters_[pid]->getBuf(type);
-        SetDevice setDevice(parameters_[pid]->getDeviceId());
-        localGrad->add(*gradBuf.bufs[i]);
-      }
-    }
-
-    gradBuf.sem.post();
-
-    if (multiMachine_->paraMainThread(pid) == threadId_) {
-      doCallback(pid);
-    } else {
-      notifyCopyGradToBuffer(pid);
-    }
-  }
-  hl_fini();
-}
-
-void TrainerThread::doCallback(int pid) {
-  REGISTER_TIMER("callback");
-  auto& gpuThreads = multiMachine_->getAllThreads();
-  if (multiMachine_->getBackwardCallback()) {
-    // The callback supplied by the user of MultiGradientMachine may handle
-    // the parameter update using the gradient.
-    multiMachine_->getBackwardCallback()(parameters_[pid].get());
-    if (parameters_[pid]->isValueUpdated()) {
-      parameters_[pid]->clearValueUpdated();
-      for (auto& thread : gpuThreads) {
-        thread->incUpdateCounter();
-      }
-      notifyValueReady(pid);
-    }
-  }
-  multiMachine_->notifyGradientTransfer(pid);
-}
-
-void TrainerThread::valueDispatchThread() {
-  VLOG(1) << "valueDispatchThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  auto& thread = multiMachine_->getThread(partnerId_);
-
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("value_dequeue");
-      pid = valueReadyQueue_.dequeue();
-    }
-    if (stopping_) break;
-
-    if (multiMachine_->paraMainThread(pid) == partnerId_) continue;
-
-    {
-      REGISTER_TIMER("copyValue");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_);
-      hl_stream_synchronize(valueStream_);
-    }
-
-    thread->notifyValueReady(pid);
-  }
-  hl_fini();
-}
-
-void TrainerThread::notifyValueReady(int paramId) {
-  if (--updateCounter_ == 0) {
-    valueReadyCond_.notify_all([this] { parameterUpdated_ = false; });
-  }
-
-  notifyValueDispatch(paramId);
-}
-
-int TrainerThread::copyInArgs() {
-  const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = fullInArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-
-  /**
-   * For the first copy, need to allocate space here
-   */
-  if (inArgs_.size() == 0) {
-    inArgs_.resize(fullInArgs.size());
-  }
-
-  if (copySize == 0) {
-    return 0;
-  }
-
-  for (size_t i = 0; i < fullInArgs.size(); i++) {
-    inArgs_[i].resizeAndCopyFrom(
-        fullInArgs[i],
-        startSeq,
-        copySize,
-        FLAGS_parallel_nn ? false : multiMachine_->useGpu());
-  }
-  return copySize;
-}
-
-void TrainerThread::mergeCpuGradients() {
-  CHECK_EQ(mergeTypes_.size(), 1UL);
-  CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT);
-
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitBeforeMerge();
-  }
-  std::vector<const std::vector<ParameterPtr>*> slaveParameters =
-      multiMachine_->getSlaveParameters();
-
-  CHECK(slaveParameters.size());
-  for (auto& para : multiMachine_->getNonStaticParameters()) {
-    if (para->useGpu()) continue;
-    if (para->isSparseRemoteUpdate()) {
-      REGISTER_TIMER("mergeRemoteGradSparse");
-      mergeGradSparseRemote(para.get(), slaveParameters);
-    } else if (para->isGradSparseUpdate()) {
-      REGISTER_TIMER("mergeGradSparse");
-      mergeGradSparse(para.get(), slaveParameters);
-    } else {
-      REGISTER_TIMER("mergeGradDense");
-      mergeGradDense(para.get(), slaveParameters);
-    }
-  }
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitAfterMerge();
-  }
-}
-
-void TrainerThread::mergeGradSparse(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-      para->getMat(PARAMETER_GRADIENT).get());
-  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads());
-    // we use a sample hash method(%) instead of range partition,
-    // because range partition has balance issue sometimes,
-    // when feature ids are not generated from hashcode.
-  }
-  uniqueIds(ids);
-}
-
-void TrainerThread::mergeGradSparseRemote(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowCpuMatrix* mainMat =
-      dynamic_cast<SparseRowCpuMatrix*>(para->getMat(PARAMETER_GRADIENT).get());
-
-  mainMat->checkIndices();
-  mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads());
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads());
-  }
-}
-
-void TrainerThread::mergeGradDense(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  auto interval = calcSplitArrayInterval(para->getSize(),
-                                         (size_t)threadId_,
-                                         multiMachine_->getNumThreads(),
-                                         8LU /*for avx*/);
-  size_t startSeq = interval.first;
-  size_t copySize = interval.second - interval.first;
-
-  // setup sub bufs
-  CpuVector destGrad(0, nullptr);
-  destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-
-  // merge
-  CpuVector slaveGradSub(0, nullptr);
-  for (auto slaveParams : slaveParameters) {
-    slaveGradSub.subVecFrom(
-        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-    destGrad.add(slaveGradSub);
-  }
-}
-
-void TrainerThread::copyOutputGrad() {
-  const std::vector<Argument>& outputGradArgs = multiMachine_->outArgs_;
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = outputGradArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-  outArgs_.resize(outputGradArgs.size());
-  for (size_t i = 0; i < outputGradArgs.size(); i++) {
-    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i],
-                                  startSeq,
-                                  copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-  gradientMachine_->setOutputGrad(outArgs_);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
deleted file mode 100644
index 674acd412..000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-
-#include "GradientMachine.h"
-
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Queue.h"
-
-namespace paddle {
-
-class TrainerThread;
-
-typedef Queue<int> PidQueue;
-typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
-
-struct GradBuffer {
-  /// GradBuffer is used for gathering gradient for GPU parameters
-  int paramId;
-
-  /// sem is used to notify that the local gradient merge of the current thread
-  /// finished for the current thread.
-  Semaphore sem;
-
-  // bufs[mergeIndex]
-  std::vector<VectorPtr> bufs;
-};
-
-/**
- *  A MultiGradientMachine is a synchronous GradientMachine which devides
- *  one data batch into several smaller batches and assign each one small batch
- *  to one computint thread for computation. After each thread finishes
- *  computation, it merges result (including output Argument and gradient during
- *  backward()). It basically is the same as single thread gradient machine,
- *  except that it uses multi-thread to do the computation.
- *
- *  It handles GPU and Cpu parameters differently.  In GPU, one computing thread
- *  generally corresponds to one GPU device. Thus, each thread keeps a separate
- *  copy of the parameter in its own device's memory. In CPU, we only need to
- keep
- *  one copy of the parameters in the main memory. After, each computing thread
- *  computes its own parameter gradient, the update process needs to accumulate
- *  the parameter gradients from all the computing threads, and update the
- *  accumulated parameter gradient to the corresponding parameter value.
- *
- *  Each GPU parameter is assigned to a thread called its main thread. For each
- *  parameter, the accumulation of its gradients and the update of its value
- *  happens in its main thread. The main thread first gather the parameter
- *  gradients from all the computing thread. Then, it performs parameter update.
- *  After a gradient is updated by the main thread, it is scattered to all the
- *  computing thread so that the parameters in all the computing threads are
- *  synchronized. The scatter and gather process are implemented by ring-style
- *  communication. Assume we have N computing threads, its thread ids will be
- *  0, 1, ..., N-1. For each parameter, the id of the main thread is specified
- in
- *  paraMainThread_[pid], where pid is the id of the parameter. Each thread i
- only
- *  sends data to its partner thread (i - 1) % N. For example, for a parameter
- *  gradient that is computed in thread 4, and its main thread is 2. Its
- *  traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the
- gradient
- *  buffer is added to the local gradient, and the local gradient is then copied
- *  to the gradient buffer of the next thread. At last, its main thread 2 will
- *  get the accumulated parameter gradient. For the same parameter, after its
- *  value is updated, the value's traveling process would be 2, 1, 0, N-1, ...
- 3.
- *  At the end, all the computing threads would have the updated parameter
- value.
- *
- *  A computing thread (TrainerThread) uses 4 threads to do different jobs:
- *
- *  1. computeThread(): performing forward(), backward(), prefetch().
- *
- *  2. valueDispatchThread(): copying parameter values to partner thread.
- *
- *  3. copyGradToBufferThread(): copying parameter gradient to partner thread.
- *
- *  4. gradCollectThread(): merging the gradient from step 3 with local gradient
- *     and call the callback supplied by the user to update parameter value.
- *
- *  CPU parameter value has only one copy. And their gradients are merged at the
- *  end of backward().
- *
- *  * Handling of sparse update
- *  Currently, sparse update is only supported for CPU parameters.
-
- *  Sparse updates refers to gradient caculation where the gradient is sparse.
- For
- *  example, if the input argument to a 'fc' layer is sparse, the gradient of
- the
- *  weight matrix of this layer will be sparse. It is usually more efficient to
- *  treat the gradient explicitly as sparse vector during the parameter update.
-
- *  There are two types of sparse updates called local sparse update and remote
- *  sparse update.
-
- *  For both types of sparse updates, there is one copy of parameter value and
- *  gradient called main parameter value and gradient, and there is a copy of
- *  parameter value and gradient for each computing thread called slave
- parameter
- *  value and gradient. The slave parameter values are always shared with the
- *  corresponding main parameter value. The slave parameter grad is a sparse row
- *  matrix. The sparse pattern for slave parameter grads are different, because
- *  the small batches for each computing thread might have different sparsity
- *  pattern.
-
- *  1. Local sparse update
- *
- *     Main parameter value type is MAT_NORMAL. It is a dense matrix.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix)
- *     It is also a dense matrix, but the updated values are specified by IDS.
- *
- *     Slave parameter value shares with main parameter value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating
- *     which rows have nonzero gradient.
- *
- *  2. Remote sparse update
- *
- *     Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE)
- *     (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix.
- *     MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the
- *     parameter values that are prefetched is up-to-date.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix).
- *     And it shares sparse pattern with value by sharing indexDictHandle_,
- which
- *     is an internal data structure used by SparseRowCpuMatrixto specify the
- *     sparsity pattern of Slave parameter value shares with main parameter
- value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparsePrefetchRowCpuMatrix). It is a sparse row matrix
- *
- *     During prefetch(), all the layers will indicates which rows of each
- *     parameter are needed. Then the framework will retrieve those rows from
- *     parameter server.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowCpuMatrix). And the framework will
- send
- *     the merged gradient to parameter server.
- */
-class MultiGradientMachine : public GradientMachine {
- public:
-  enum TaskType {
-    TASK_FORWARD_BACKWARD = 0,
-    TASK_FORWARD = 1,
-    TASK_BACKWARD = 2,
-    TASK_COPY_IN_ARGS = 3,
-  };
-
-  explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
-
-  virtual void start();
-
-  virtual void finish();
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  bool useGpu() const { return useGpu_; }
-
-  /// @return whether to pass the gradients in outArgs_ to each threads.
-  bool isPassGrad() { return isPassGrad_; }
-
-  /// @brief set whether to pass the gradient in outArgs_ to each threads.
-  void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
-
-  /// Set the gradients of the outputs.
-  /// The gradietns will be copied to each thread in the computing threads.
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
- protected:
-  friend class TrainerThread;
-
-  std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
-  /// Calculate the real device id based on the logical device id and the
-  /// thread id.
-  int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
-    if (logicalId == -1) {
-      logicalId = 0;
-    }
-    return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_,
-               numDevices_);
-  }
-
-  /// Calculate the logical device id based on the real device id and the
-  /// thread id.
-  int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
-    if (realId == -1) {
-      return 0;
-    } else {
-      return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_,
-                 numDevices_);
-    }
-  }
-
-  std::vector<const std::vector<ParameterPtr>*> getSlaveParameters();
-
-  bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; }
-
-  /// Called TrainerThread to wait before merging CPU parameter gradients.
-  void waitBeforeMerge() { trainerBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait after merging
-  /// CPU parameter graidents.
-  void waitAfterMerge() { allBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
-  /// finishing
-  void waitForCopyInArgs() { allBarrier_.wait(); }
-
-  TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; }
-
-  std::vector<GradBuffer>& getGradBuf(int threadId) {
-    return gradBufs_[threadId];
-  }
-
-  PassType getPassType() const { return passType_; }
-
-  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
-  /// for paramId is ready
-  void notifyGradientTransfer(int paramId);
-
-  const std::vector<Argument>& getInArgs() { return inArgs_; }
-
-  TaskType getTaskType() const { return taskType_; }
-
-  const UpdateCallback& getBackwardCallback() const {
-    return backwardCallback_;
-  }
-
-  int getNumDevices() const { return numDevices_; }
-
-  int getNumLogicalDevices() const { return numLogicalDevices_; }
-
-  int getNumThreads() const { return numThreads_; }
-
-  int paraMainThread(int pid) const { return paraMainThread_[pid]; }
-
- protected:
-  virtual void forwardImp(const std::vector<Argument>& inArgs,
-                          std::vector<Argument>* outArgs,
-                          PassType passType,
-                          TaskType taskType);
-
-  virtual void backwardImp(const UpdateCallback& callback = NULL);
-
-  /// update all parameters
-  void updateThreadParameters();
-
-  void startTask(TaskType taskType);
-
-  void getOutArgs(std::vector<Argument>* outArgs, PassType passType);
-
-  void allocGradBufs();
-
- protected:
-  bool useGpu_;
-
-  bool hasNonstaticCpuParamters_;
-
-  /// store main parameter only
-  std::unique_ptr<GradientMachine> gradientMachine_;
-
-  std::vector<TrainerThreadPtr> threads_;
-  std::vector<int> paraMainThread_;
-  std::vector<std::vector<GradBuffer>> gradBufs_;  // [threadId][deviceId]
-  std::vector<size_t> bufferSizes_;
-
-  PassType passType_;
-  TaskType taskType_;
-  PidQueue gradQueue_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  hl_stream_t outArgStream_;
-
-  Argument outLayerArgs_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-  int numDevices_;         /* number of gpu devices */
-  int numLogicalDevices_;  // number of GPU used by one NN
-  int numThreads_;         /* number of train threads */
-
-  UpdateCallback backwardCallback_;
-
-  /// barrrier for threads_
-  ThreadBarrier trainerBarrier_;
-
-  /// barrier for both MultiGradientMachine and threds_
-  ThreadBarrier allBarrier_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-
-  /// Whether to copy the gradient back from an external input.
-  bool isPassGrad_;
-};
-
-class TrainerThread {
- public:
-  TrainerThread(const ModelConfig& config,
-                int threadId,
-                MultiGradientMachine* multiMachine);
-
-  ~TrainerThread();
-
-  void start();
-
-  void onPassEnd() { gradientMachine_->onPassEnd(); }
-
-  void waitOutArgsReady() { outArgsReadySem_.wait(); }
-
-  void notifyTaskReady() { taskReadySem_.post(); }
-
-  int getDeviceId() const { return deviceId_; }
-
-  GradientMachine* getGradientMachine() { return gradientMachine_.get(); }
-
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  void stop();
-
-  void notifyValueReady(int paramId);
-
-  const VectorPtr& getValueBuf(int paramId) {
-    return parameters_[paramId]->getBuf(PARAMETER_VALUE);
-  }
-
-  const std::vector<Argument>& getOutArgs() { return outArgs_; }
-
-  void incUpdateCounter(int n = 1) {
-    updateCounter_ += n;
-    parameterUpdated_ = true;
-  }
-
-  void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); }
-
-  void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); }
-
-  void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); }
-
-  void prefetch();
-
-  /// copy the output gradient from the main GradientMachine.
-  void copyOutputGrad();
-
-  /// Whether the thread has input data.
-  bool hasInputData() { return batchSize_ != 0; }
-
- protected:
-  void mergeCpuGradients();
-
-  void mergeGradSparse(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradSparseRemote(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradDense(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void computeThread();
-  void valueDispatchThread();
-  void copyGradToBufferThread();
-  void gradCollectThread();
-
-  int copyInArgs();
-  void forward();
-  void backward();
-  void backwardCallback(Parameter* para);
-
-  /// call the actuall callback supplied by the caller of
-  /// GradientMachine::backward
-  void doCallback(int pid);
-
- protected:
-  MultiGradientMachine* multiMachine_;
-  ModelConfig config_;
-  /// whether the thread should stop
-  bool stopping_;
-  /// the threads form which to collect gradient
-  int partnerId_;
-  /// from 0 to threads-1
-  int threadId_;
-  int deviceId_;
-  std::unique_ptr<GradientMachine> gradientMachine_;
-  std::vector<ParameterPtr> parameters_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-
-  /// compute thread
-  std::unique_ptr<std::thread> computeThread_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  Semaphore taskReadySem_;
-  Semaphore outArgsReadySem_;
-
-  /// copy thread
-  std::unique_ptr<std::thread> copyThread_;
-  /// queue of gradient needs to be copied to partner
-  PidQueue gradBufQueue_;
-  hl_stream_t gradStream_;
-
-  /// grad merge thread
-  std::unique_ptr<std::thread> gradCollectThread_;
-  /// queue of gradient needs to be merged with gradient coopied by
-  /// copyGradToBufferThread
-  PidQueue gradQueue_;
-  UpdateCallback backwardCallback_;
-
-  /// value dispatch thread
-  std::unique_ptr<std::thread> valueDispatchThread_;
-  /// queue of the parameter whose the vale are ready for copy
-  PidQueue valueReadyQueue_;
-
-  /// used to notify all the parameter values are ready
-  LockedCondition valueReadyCond_;
-
-  hl_stream_t valueStream_;
-  /// how many parameters are updated
-  std::atomic<int> updateCounter_;
-  bool parameterUpdated_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-  int batchSize_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
deleted file mode 100644
index 1245c4410..000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "MultiNetwork.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-namespace paddle {
-
-void MultiNetwork::init(const ModelConfig& config,
-                        ParamInitCallback callback,
-                        const std::vector<ParameterType>& parameterTypes,
-                        bool useGpu) {
-  CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1";
-  // check submodel[0] is root
-  CHECK_EQ("root", config.sub_models(0).name())
-      << "sub_models(0) should be root";
-  // ignore root
-  subNetworks_.resize(config.sub_models_size() - 1);
-  // base class
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  // sub networks
-  for (int i = 1; i < config.sub_models_size(); ++i) {
-    std::string subModelName = config.sub_models(i).name();
-    if (FLAGS_parallel_nn) {
-      subNetworks_[i - 1] = std::unique_ptr<ParallelNeuralNetwork>(
-          new ParallelNeuralNetwork(subModelName, this));
-    } else {
-      subNetworks_[i - 1] = std::unique_ptr<NeuralNetwork>(
-          NeuralNetwork::newNeuralNetwork(subModelName, this));
-    }
-    subNetworks_[i - 1]->init(config);
-  }
-}
-
-void MultiNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->prefetch(argumentGroups[i]);
-  }
-}
-
-void MultiNetwork::forward(const std::vector<Argument>& inArgs,
-                           std::vector<Argument>* outArgs,
-                           PassType passType) {
-  // split inArgs to several vectors
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  std::vector<Argument> tempOutArgs;
-  outArgs->clear();
-
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    tempOutArgs.clear();
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType);
-    for (const auto& elem : tempOutArgs) {
-      outArgs->push_back(elem);
-      outArgs->back().dataId = i;
-    }
-  }
-}
-
-void MultiNetwork::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->backward(callback);
-  }
-}
-
-void MultiNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType,
-                                   const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void MultiNetwork::onPassEnd() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->onPassEnd();
-  }
-}
-
-void MultiNetwork::start() {
-  for (auto& subNetwork : subNetworks_) {
-    subNetwork->start();
-  }
-}
-
-void MultiNetwork::finish() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->finish();
-  }
-}
-
-class MultiCombinedEvaluator : public Evaluator {
- public:
-  MultiCombinedEvaluator() {}
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  virtual void start() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  virtual void finish() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    const MultiNetwork& multiNetwork = dynamic_cast<const MultiNetwork&>(nn);
-    CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size());
-    int size = evaluators_.size();
-    for (int i = 0; i < size; i++) {
-      // one evaluator for one subNetwork
-      evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
- protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-};
-
-Evaluator* MultiNetwork::makeEvaluator() const {
-  MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator();
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    std::unique_ptr<Evaluator> evaluator(subNetworks_[i]->makeEvaluator());
-    multiCombinedEvaluator->addEvaluator(std::move(evaluator));
-  }
-  return multiCombinedEvaluator;
-}
-
-void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
deleted file mode 100644
index afe15cb02..000000000
--- a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-class MultiNetwork : public NeuralNetwork {
- public:
-  explicit MultiNetwork(std::string subModelName = "")
-      : NeuralNetwork(subModelName) {}
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<std::unique_ptr<NeuralNetwork>>& getSubNetworks() const {
-    return subNetworks_;
-  }
-
-  virtual void start();
-
-  virtual void finish();
-
- protected:
-  std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
deleted file mode 100644
index 0f8048152..000000000
--- a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-#endif
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "MultiNetwork.h"
-#include "RecurrentGradientMachine.h"
-#include "paddle/legacy/gserver/layers/AgentLayer.h"
-#endif
-
-namespace paddle {
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams) {
-  // Create parameters values.
-  if (!para->useGpu() && sharedParams) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE),
-                           (*sharedParams)[paramId]->getMat(PARAMETER_VALUE));
-  } else {
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-    } else {
-      para->enableType(PARAMETER_VALUE);
-    }
-  }
-  // Create parameter gradients.
-  if (para->isSparseRemoteUpdate() && !sharedParams) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-  } else if (para->isGradSparseUpdate()) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW);
-  } else if (!para->isStatic()) {
-    para->enableType(PARAMETER_GRADIENT);
-  }
-}
-
-NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (config.type() == "recurrent_nn") {
-    return newNeuralNetwork("root");
-  } else if (config.type() == "multi_nn") {
-    return new MultiNetwork("root");
-  } else {
-    return newNeuralNetwork();
-  }
-#else
-  return new NeuralNetwork();
-#endif
-}
-
-std::map<std::string, bool> NeuralNetwork::dllInitMap;
-
-void NeuralNetwork::init(const ModelConfig& config,
-                         ParamInitCallback callback,
-                         const std::vector<ParameterType>& parameterTypes,
-                         bool useGpu) {
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-  ParamInitCallback paramCallback = nullptr;
-  if (callback != nullptr) {
-    paramSelfInited_ = false;
-    paramCallback = callback;
-  } else {
-    paramSelfInited_ = true;
-    paramCallback = std::bind(parameterInitNN, _1, _2, nullptr);
-  }
-  config_ = config;
-
-  if (rootNetwork_ != nullptr) {
-    // direct use parameters_ and parameterMap_ from base network
-    CHECK_EQ((size_t)config.parameters_size(),
-             rootNetwork_->getParameters().size());
-    parameters_ = rootNetwork_->getParameters();
-    parameterMap_ = *(rootNetwork_->getParameterMap());
-  } else {
-    parameters_.reserve(config.parameters_size());
-    for (const auto& para_config : config.parameters()) {
-      auto parameter = std::make_shared<Parameter>(para_config,
-                                                   useGpu,
-                                                   /*initialize=*/false);
-      paramCallback(parameters_.size(), parameter.get());
-      if (!callback) {
-        for (ParameterType type :
-             (parameter->isStatic()
-                  ? std::vector<ParameterType>{PARAMETER_VALUE}
-                  : parameterTypes)) {
-          if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) {
-            parameter->enableType(type);
-          }
-        }
-      }
-      parameter->setID(parameters_.size());
-      parameters_.push_back(parameter);
-      CHECK(!parameterMap_.count(parameter->getName()));
-      parameterMap_[parameter->getName()] = parameter;
-    }
-  }
-
-  auto layerCreate = [&](const LayerConfig& layer_config) {
-    auto layer = Layer::create(layer_config);
-    CHECK(layer) << "Create layer failed. Layer name:" << layer->getName();
-    layers_.push_back(layer);
-    CHECK(!layerMap_.count(layer->getName()));
-    layerMap_[layer->getName()] = layer;
-  };
-
-  auto subModelConfig = std::find_if(config.sub_models().begin(),
-                                     config.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    layers_.reserve(subModelConfig->layer_names_size());
-    for (const auto& layer_name : subModelConfig->layer_names()) {
-      auto layer_config =
-          std::find_if(config.layers().begin(),
-                       config.layers().end(),
-                       [=](const LayerConfig& layer_config) {
-                         return layer_config.name() == layer_name;
-                       });
-      CHECK(layer_config != config.layers().end());
-      layerCreate(*layer_config);
-    }
-  } else {
-    layers_.reserve(config.layers_size());
-    for (const auto& layer_config : config.layers()) {
-      bool useLayer = true;
-      if (config.has_external_config()) {
-        useLayer = true;
-        for (const auto& name : config.external_config().layer_names()) {
-          if (layer_config.name() == name) {
-            useLayer = false;
-            break;
-          }
-        }
-      }
-      if (useLayer) {
-        layerCreate(layer_config);
-      }
-    }
-  }
-
-  for (const auto& layer : layers_) {
-    layer->init(layerMap_, parameterMap_);
-    layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu);
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->input_layer_names()
-                    : config.input_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    dataLayers_.push_back(std::dynamic_pointer_cast<DataLayer>(it->second));
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->output_layer_names()
-                    : config.output_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    outputLayers_.push_back(it->second);
-  }
-
-  for (const auto& layer : layers_) {
-    const auto& name = layer->getName();
-    bool isMiddleLayer = true;
-
-    // if data layer
-    for (const auto& dataLayer : dataLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    // if output layer
-    for (const auto& dataLayer : outputLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    if (isMiddleLayer) {
-      middleLayers_.push_back(layer);
-    }
-  }
-}
-
-void NeuralNetwork::connect(LayerPtr agentLayer,
-                            LayerPtr realLayer,
-                            int height) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
-  CHECK_NOTNULL(agent);
-  agent->setRealLayer(realLayer, height);
-#endif
-}
-
-void NeuralNetwork::connect(std::string agentLayerName,
-                            NeuralNetwork* srcNN,
-                            std::string realLayerName) {
-  connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName));
-}
-
-void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        para->clearGradient();
-        if (mat) mat->clearIndices();
-      }
-    }
-  }
-
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    if (FLAGS_parallel_nn) {
-      const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    }
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    layer->prefetch();
-  }
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        mat->setupIndices();
-        auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-            para->getMat(PARAMETER_GRADIENT).get());
-        matGrad->reserveStore();
-      }
-    }
-  }
-}
-
-void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                            std::vector<Argument>* outArgs,
-                            PassType passType) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  gLayerStackTrace.set_stage(true);
-
-  {
-    for (auto& layer : layers_) {
-      REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
-      gLayerStackTrace.push(layer->getName());
-      layer->forward(passType);
-      gLayerStackTrace.pop(layer->getName());
-    }
-  }
-
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void NeuralNetwork::resetState() {
-  for (auto& layer : layers_) {
-    layer->resetState();
-  }
-}
-
-void NeuralNetwork::setState(const MachineState& machineState) {
-  for (size_t i = 0; i < layers_.size(); i++) {
-    if (machineState[i] != nullptr) {
-      layers_[i]->setState(machineState[i]);
-    }
-  }
-}
-
-void NeuralNetwork::getState(MachineState& machineState) {
-  machineState.clear();
-  machineState.reserve(layers_.size());
-  for (auto& layer : layers_) {
-    LayerStatePtr p = layer->getState();
-    machineState.push_back(p);
-  }
-}
-
-void NeuralNetwork::backward(const UpdateCallback& callback) {
-  gLayerStackTrace.set_stage(false);
-  FOR_EACH_R(layer, layers_) {
-    REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
-    gLayerStackTrace.push((*layer)->getName());
-    if ((*layer)->needGradient()) {
-      (*layer)->backward(callback);
-    }
-    gLayerStackTrace.pop((*layer)->getName());
-  }
-}
-
-void NeuralNetwork::finish() {
-#ifdef PADDLE_WITH_MKLDNN
-  FOR_EACH_R(layer, layers_) {
-    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
-    if (dnnLayer) {
-      dnnLayer->convertWeightsToPaddle();
-    }
-  }
-#endif
-}
-
-Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  return getLayer(layerName)->getOutput();
-}
-
-void NeuralNetwork::onPassEnd() {
-  for (auto& layer : layers_) {
-    layer->onPassEnd();
-  }
-}
-
-void NeuralNetwork::releaseOutput() {
-  for (auto& layer : middleLayers_) {
-    Argument& arg = layer->getOutput();
-    arg.value.reset();
-  }
-}
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-class CombinedEvaluator : public Evaluator {
- public:
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  void start() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  void finish() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  void eval(const NeuralNetwork& nn) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->eval(nn);
-    }
-  }
-  real evalImp(std::vector<Argument>& arguments) override {
-    (void)arguments;
-    return -1;
-  }
-  void printStats(std::ostream& os) const override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  void distributeEval(ParameterClient2* client) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
- protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-
-  // Evaluator interface
- public:
-  /**
-   * @brief getNames will return all inside evaluators' names.
-   * @param names [out]: return names.
-   */
-  void getNames(std::vector<std::string>* names) override {
-    for (auto& eval : evaluators_) {
-      eval->getNames(names);
-    }
-  }
-
-  /**
-   * @brief getValue could get all inside evaluators' value.
-   */
-  real getValue(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<real>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getValue(name, err);
-        });
-  }
-
-  /**
-   * @brief getType could get all inside evaluators' type.
-   */
-  std::string getType(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<std::string>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getType(name, err);
-        });
-  }
-
- private:
-  template <typename T>
-  T getMethodHelper(const std::string& name,
-                    Error* err,
-                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
-                        callback) const {
-    for (auto& eval : evaluators_) {
-      std::vector<std::string> names;
-      eval->getNames(&names);
-      if (std::find(names.begin(), names.end(), name) != names.end()) {
-        return callback(eval);
-      }
-    }
-    *err = Error("No such key %s", name.c_str());
-    return T();
-  }
-};
-
-class SubnetEvaluator : public CombinedEvaluator {
- public:
-  SubnetEvaluator(const std::string& layerName,
-                  std::unique_ptr<Evaluator>&& evaluator)
-      : layerName_(layerName) {
-    addEvaluator(std::move(evaluator));
-  }
-  void eval(const NeuralNetwork& nn) override {
-    const LayerPtr& layer = nn.getLayer(layerName_);
-    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
-                 << nn.getName();
-    bool accessed = false;
-    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
-      subnet.eval(evaluators_[0].get());
-      accessed = true;
-    });
-    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
-                    << " in submodel " << nn.getName();
-  }
-
- protected:
-  std::string layerName_;
-};
-
-Evaluator* NeuralNetwork::makeEvaluator() const {
-  CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
-  auto subModelConfig = std::find_if(config_.sub_models().begin(),
-                                     config_.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config_.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    // create the evaluators that belong to CURRENT submodel
-    for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) {
-      // find evaluator by name
-      auto thisEvalConfig = std::find_if(
-          config_.evaluators().begin(),
-          config_.evaluators().end(),
-          [=](const EvaluatorConfig& ecfg) {
-            return ecfg.name() == subModelConfig->evaluator_names(i);
-          });
-      bool validConfig = (thisEvalConfig != config_.evaluators().end());
-      if (validConfig) {
-        std::unique_ptr<Evaluator> evaluator(
-            Evaluator::create(*thisEvalConfig));
-        combinedEvaluator->addEvaluator(std::move(evaluator));
-      }
-    }
-    for (auto& layer : layers_) {
-      layer->accessSubNetwork(
-          [layer, combinedEvaluator](NeuralNetwork& subnet) {
-            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
-                layer->getName(),
-                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
-            combinedEvaluator->addEvaluator(std::move(subEvaluator));
-          });
-    }
-  } else {
-    for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
-      std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
-      combinedEvaluator->addEvaluator(std::move(evaluator));
-    }
-  }
-  return combinedEvaluator;
-}
-
-void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-#endif
-
-void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_GE(outputLayers_.size(), args.size());
-  for (size_t i = 0; i < args.size(); ++i) {
-    outputLayers_[i]->getOutput().grad = args[i].grad;
-  }
-}
-
-extern NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                             NeuralNetwork* network)
-    __attribute__((weak));
-
-NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name,
-                                               NeuralNetwork* rootNetwork) {
-  if (newCustomNerualNetwork) {
-    return newCustomNerualNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
deleted file mode 100644
index 566157c89..000000000
--- a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/legacy/gserver/layers/CostLayer.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-/*
- * @brief  Init function for the parameters.
- * @param paramId: the id of the parameter to init.
- * @param para: the pointer to the parameter to init.
- * @param sharedParams: the pointer to an array of the parameter to be shared.
- *                      If it is null, no parameter sharing is used.
- *                      Only CPU paramters can be shared.
- * It handles CPU, CPU sparse, CPU sparse remote,
- * and GPU parameters differently. If the type
- * of a parameter is NORMAL. Basically nothing need to be done.
- * CPU value: NORMAL.
- * CPU param: NORMAL.
- *
- * CPU sparse value: NORMAL.
- * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW.
- *
- * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE).
- * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams)
- *                             MAT_SPARSE_ROW_AUTO_GROW (sharedParams)
- *
- * GPU value: NORMAL
- * GPU param: NORMAL
- */
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams);
-
-class NeuralNetwork : public GradientMachine {
- public:
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType>& parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * Connect two submodels and
-   * down-submodel's output become up-submodel's input.
-   * By default, connection is one by one,
-   * If the agent height is smaller than real layer, *height* has to be filled.
-   *
-   * @param realLayer  The down-submodel's output layer.
-   * @param agentLayer The up-submodel's input agent layer.
-   */
-  static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
-  void connect(std::string agentLayerName,
-               NeuralNetwork* srcNN,
-               std::string realLayerName);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  const LayerPtr& getLayer(const std::string& layerName) const {
-    auto it = layerMap_.find(layerName);
-    CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
-    return it->second;
-  }
-
-  virtual void onPassEnd();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-#endif
-
-  virtual void resetState();
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
-  /// set machine state
-  virtual void setState(const MachineState& machineState);
-
-  /// get machine state
-  virtual void getState(MachineState& machineState);
-
-  static NeuralNetwork* create(const ModelConfig& config);
-
-  ParameterMap* getParameterMap() { return &parameterMap_; }
-
-  /**
-   * @brief Access each layer as a for each loop.
-   * @param callback invoke with each layer.
-   */
-  template <typename T>
-  void forEachLayer(T callback) {
-    for (auto& l : layers_) {
-      if (callback(l)) {
-        break;
-      }
-    }
-  }
-
-  static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
-                                         NeuralNetwork* rootNetwork = nullptr);
-
-  const std::string& getName() const { return subModelName_; }
-
-  /// some finish work, like convert the weight format of MKLDNNLayers
-  void finish();
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  void releaseOutput();
-
- protected:
-  /**
-   * The constructor of NeuralNetwork.
-   * The sub networks can get parameters_ and parameterMap_
-   * from base NeuralNetwork.
-   *
-   * @param subModelName The name of sub-model.
-   * @param rootNetwork  It used in MultiNetwork.
-   */
-  NeuralNetwork(std::string subModelName = "",
-                NeuralNetwork* rootNetwork = nullptr)
-      : subModelName_(subModelName), rootNetwork_(rootNetwork) {}
-
-  std::string subModelName_;
-  ModelConfig config_;
-  std::vector<LayerPtr> layers_;
-  ParameterMap parameterMap_;
-  LayerMap layerMap_;
-
-  std::vector<DataLayerPtr> dataLayers_;
-  std::vector<LayerPtr> outputLayers_;
-  std::vector<LayerPtr> middleLayers_;
-
-  static std::map<std::string, bool> dllInitMap;
-
-  NeuralNetwork* rootNetwork_;
-
-  /// Whether parameter of this NN is initialized by its own
-  /// (i.e., not by callback supplied with the caller)
-  bool paramSelfInited_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
deleted file mode 100644
index 33d24b5b8..000000000
--- a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "ParallelNeuralNetwork.h"
-
-#include <pthread.h>
-#include <sched.h>
-
-namespace paddle {
-
-void ParallelNeuralNetwork::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-
-  if (config.type() == "recurrent_nn") {
-    LOG(FATAL)
-        << "You can not add `--parallel_nn=true` on the command line, "
-        << "parallel_nn training mode does not support the recurrent_nn model.";
-  }
-
-  useGpu_ = useGpu;
-  numDevices_ = 0;
-  if (useGpu_) {
-    numDevices_ = hl_get_device_count();
-  }
-
-  for (auto& layer : layers_) {
-    int deviceId = layer->getDeviceId();
-    CHECK_LT(deviceId, numDevices_);
-    addComputeThread(deviceId);
-  }
-}
-
-void ParallelNeuralNetwork::addComputeThread(int deviceId) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      return;
-    }
-  }
-
-  threads_.emplace_back(new ParallelThread(
-      threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false));
-}
-
-void ParallelNeuralNetwork::waitAllThread() {
-  for (auto& thread : threads_) {
-    thread->jobEnqueue(NULL, TASK_END_LAYER);
-  }
-
-  for (size_t i = 0; i < threads_.size(); i++) {
-    threads_[i]->queue_.waitEmpty();
-  }
-}
-
-void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId,
-                                               LayerPtr layer,
-                                               TaskType task) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      thread->jobEnqueue(layer, task);
-      return;
-    }
-  }
-  LOG(FATAL) << "No specific device thread ";
-}
-
-void ParallelNeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>* outArgs,
-                                    PassType passType) {
-  for (auto& thread : threads_) {
-    thread->setForwardPassType(passType);
-  }
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD);
-  }
-
-  {
-    REGISTER_TIMER("forwardTime");
-    waitAllThread();
-  }
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void ParallelNeuralNetwork::backward(const UpdateCallback& callback) {
-  for (auto& thread : threads_) {
-    thread->setBackwardCallback(callback);
-  }
-
-  FOR_EACH_R(layer, layers_) {
-    dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD);
-  }
-  {
-    REGISTER_TIMER("backwardTime");
-    waitAllThread();
-  }
-}
-
-void ParallelNeuralNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                            std::vector<Argument>* outArgs,
-                                            PassType passType,
-                                            const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void ParallelNeuralNetwork::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu)
-    : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {}
-
-ParallelThread::~ParallelThread() { stop(); }
-
-void ParallelThread::stop() {
-  if (computeThread_) {
-    jobEnqueue(NULL, TASK_THREAD_FINISH);
-    computeThread_->join();
-    computeThread_.reset(nullptr);
-  }
-}
-
-void ParallelThread::computeThread() {
-  LOG(INFO) << "gradComputeThread " << threadId_;
-
-  if (useGpu_) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    struct Job job_work = queue_.dequeue();
-
-    if (job_work.task_ == TASK_END_LAYER) {
-      continue;
-    } else if (job_work.task_ == TASK_THREAD_FINISH) {
-      break;
-    }
-
-    if (TASK_FORWARD == job_work.task_) {
-      {
-        REGISTER_TIMER_INFO("waitInputValue",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitInputValue();
-      }
-      {
-        REGISTER_TIMER_INFO("threadForwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->forward(passType_);
-      }
-      {
-        REGISTER_TIMER_INFO("copyOutputToOtherDevice",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->copyOutputToOtherDevice();
-      }
-    } else {
-      {
-        REGISTER_TIMER_INFO("waitAndMergeOutputGrad",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitAndMergeOutputGrad();
-      }
-      {
-        REGISTER_TIMER_INFO("threadBackwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->backward(backwardCallback_);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      job_work.layer_->markAllInputGrad();
-    }
-  }
-  hl_fini();
-}
-
-void ParallelThread::start() {
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-}
-
-void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) {
-  struct Job job_work;
-  job_work.layer_ = layer;
-  job_work.task_ = task;
-  queue_.enqueue(job_work);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
deleted file mode 100644
index c09145950..000000000
--- a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "NeuralNetwork.h"
-
-namespace paddle {
-
-class ParallelThread;
-
-enum TaskType {
-  TASK_FORWARD = 0,
-  TASK_BACKWARD = 1,
-  TASK_END_LAYER = 2,
-  TASK_THREAD_FINISH = 3,
-};
-
-/**
- * A ParallelNeuralNetwork is capable of calculating a neural network through
- * multiple threads in parallel.
- */
-class ParallelNeuralNetwork : public NeuralNetwork {
- public:
-  ParallelNeuralNetwork(std::string subModelName = "",
-                        NeuralNetwork *rootNetwork = nullptr)
-      : NeuralNetwork(subModelName, rootNetwork) {}
-
-  virtual void init(const ModelConfig &config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType> &parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  virtual void forward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback &callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType,
-                       const UpdateCallback &callback = NULL);
-
-  virtual void start();
-
-  void addComputeThread(int deviceId);
-
-  void dispatchByDeviceId(int deviceId, LayerPtr layer, TaskType task);
-
-  void waitAllThread();
-
-  // virtual void eval(Evaluator* evaluator);
-
- protected:
-  bool useGpu_;
-  /// number of gpu devices
-  int numDevices_;
-  std::vector<std::unique_ptr<ParallelThread>> threads_;
-};
-
-class ParallelThread {
- public:
-  ParallelThread(int threadId, int deviceId, bool useGpu);
-  ~ParallelThread();
-  void jobEnqueue(LayerPtr layer, TaskType task);
-  void start();
-  void stop();
-  int getDeviceId() const { return deviceId_; }
-
-  void setBackwardCallback(const UpdateCallback &callback) {
-    backwardCallback_ = callback;
-  }
-  void setForwardPassType(PassType passType) { passType_ = passType; }
-
- protected:
-  void computeThread();
-
- public:
-  struct Job {
-    LayerPtr layer_;
-    TaskType task_;
-  };
-  typedef Queue<Job> JobQueue;
-  JobQueue queue_;
-
- protected:
-  /// from 0 to threads-1
-  int threadId_;
-  /// the GPU device Id which the computeThread_ used
-  int deviceId_;
-  bool useGpu_;
-  std::unique_ptr<std::thread> computeThread_;
-  /// whether the thread should stop
-  bool stopping_;
-  UpdateCallback backwardCallback_;
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
deleted file mode 100644
index e49f04240..000000000
--- a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,1501 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RecurrentGradientMachine.h"
-#include <dlfcn.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include "NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/AgentLayer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
-
-static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
-static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
-static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob";
-
-namespace paddle {
-
-/**
- * Start Custom Calculate Probability callback type.
- *
- * @param nNode, nodes: the path will be explored. nNodes is array size.
- *                      nodes is array elements.
- *
- * @return: A custom handler id that will passed to another callback.
- */
-typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes);
-
-/**
- * Doing Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- * @param nNode, nodes: Array. The current path.
- * @param curProb: The current log probability that neural network returns.
- *
- * @return: Log probability which user calculated, it will be updated to this
- *          path.
- * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!!
- */
-typedef real (*DiyCalcProbCallback)(
-    int handler, size_t nNodes, int* nodes, real curProb, bool atEos);
-
-/**
- * Finish Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- */
-typedef void (*DiyStopCalcProbCallback)(int handler);
-
-static DiyCalcProbCallback gDiyProbMethod = nullptr;
-static DiyStartCalcProbCallback gDiyProbStart = nullptr;
-static DiyStopCalcProbCallback gDiyProbStop = nullptr;
-static void* gDiyProbHandle = nullptr;
-
-static void exit_diy_prob() { dlclose(gDiyProbHandle); }
-
-template <typename SymbolType>
-static inline SymbolType loadDiySymbol(const char* symbolName) {
-  void* sym = dlsym(gDiyProbHandle, symbolName);
-  CHECK(sym) << "Cannot load symbol " << symbolName << " from "
-             << FLAGS_diy_beam_search_prob_so;
-  return reinterpret_cast<SymbolType>(sym);
-}
-
-static InitFunction __init__diy_prob_method(
-    [] {
-      std::string soName = FLAGS_diy_beam_search_prob_so;
-      if (!soName.empty()) {
-        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-        atexit(exit_diy_prob);
-        gDiyProbMethod =
-            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
-            DIY_START_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
-            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-      }
-    },
-    std::numeric_limits<int>::max());
-
-class BeamSearchControlCallbacks {
- public:
-  RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
-      beamSearchCandidateAdjust;
-  RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
-  RecurrentGradientMachine::DropCallback stopDetermineCandidates;
-
-  //! for gcc46 aggregate initialization is not very well, so we need to
-  //! explicit
-  BeamSearchControlCallbacks(
-      const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback&
-          candidateAdjust,
-      const RecurrentGradientMachine::NormOrDropNodeCallback& norm,
-      const RecurrentGradientMachine::DropCallback& stop)
-      : beamSearchCandidateAdjust(candidateAdjust),
-        normOrDropNode(norm),
-        stopDetermineCandidates(stop) {}
-};
-
-class BeamSearchStatisticsCallbacks {
- public:
-  RecurrentGradientMachine::EachStepCallback onEachStepStarted;
-  RecurrentGradientMachine::EachStepCallback onEachStepStoped;
-
-  BeamSearchStatisticsCallbacks(
-      const RecurrentGradientMachine::EachStepCallback& start,
-      const RecurrentGradientMachine::EachStepCallback& stop)
-      : onEachStepStarted(start), onEachStepStoped(stop) {}
-};
-
-RecurrentGradientMachine::RecurrentGradientMachine(
-    const std::string& subModelName, NeuralNetwork* rootNetwork)
-    : NeuralNetwork(subModelName),
-      rootNetwork_(rootNetwork),
-      beamSearchCtrlCallbacks_(nullptr),
-      beamSearchStatistics_(nullptr) {
-  CHECK(!subModelName_.empty());
-}
-
-/**
- * bias layer, as input of memory frame 0 will give vector of zeros
- * if bias parameter is not set.
- *
- * boot bias layer create directly in recurrent gradient machine, because:
- *
- * 1. It is only one frame, so it should not be placed in layer group,
- *    which is one instance for every one frame.
- *
- * 2. It is no input layer, so it need resetHeight() before forward(),
- *    and resetHeight() must be called in recurrent gradient machine,
- *    so it's should not be placed in root network.
- */
-class BootBiasLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  IVectorPtr cpuIds_;
-
- public:
-  explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-
-    if (biasParameter_) {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-    }
-    return true;
-  }
-
-  void resetHeight(int height) {
-    if (config_.has_bos_id()) {  // used as a constant id layerConfig
-      IVector::resizeOrCreate(output_.ids, height, useGpu_);
-      output_.ids->reset((int)config_.bos_id());
-    } else {
-      resetOutput(height, getSize());
-    }
-  }
-
-  void forward(PassType passType) override {
-    if (biases_) {
-      MatrixPtr outV = getOutputValue();
-      outV->addBias(*(biases_->getW()), 1);
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (biases_ && biases_->getWGrad()) {
-      backwardActivation();
-      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-      biases_->getParameterPtr()->incUpdate(callback);
-    }
-  }
-};
-
-void RecurrentGradientMachine::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  useGpu_ = useGpu;
-
-  auto subModelConfig =
-      std::find_if(config.sub_models().begin(),
-                   config.sub_models().end(),
-                   [this](const SubModelConfig& sub_model) {
-                     return sub_model.name() == this->subModelName_;
-                   });
-  CHECK(subModelConfig != config.sub_models().end());
-  reversed_ = subModelConfig->reversed();
-  generating_ = subModelConfig->has_generator();
-
-  inFrameLines_.resize(subModelConfig->in_links_size());
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
-    inFrameLines_[i].inLayer =
-        rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
-  }
-
-  outFrameLines_.resize(subModelConfig->out_links_size());
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    auto& linkPair = subModelConfig->out_links(i);
-    outFrameLines_[i].layerName = linkPair.layer_name();
-    outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name());
-  }
-
-  memoryFrameLines_.resize(subModelConfig->memories_size());
-  for (size_t i = 0; i < memoryFrameLines_.size(); ++i) {
-    auto& memoryConfig = subModelConfig->memories(i);
-    memoryFrameLines_[i].layerName = memoryConfig.layer_name();
-    memoryFrameLines_[i].linkName = memoryConfig.link_name();
-    auto agentConfig =
-        std::find_if(config.layers().begin(),
-                     config.layers().end(),
-                     [&memoryConfig](const LayerConfig& layerConfig) {
-                       return layerConfig.name() == memoryConfig.link_name();
-                     });
-    CHECK(agentConfig != config.layers().end());
-    if (memoryConfig.has_boot_layer_name()) {
-      memoryFrameLines_[i].rootLayer =
-          rootNetwork_->getLayer(memoryConfig.boot_layer_name());
-
-      LayerConfig scatterConfig = *agentConfig;
-      memoryFrameLines_[i].rootAgent.reset(
-          new ScatterAgentLayer(scatterConfig));
-      memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
-    } else {
-      LayerConfig biasConfig = *agentConfig;
-      if (memoryConfig.has_boot_bias_parameter_name()) {
-        biasConfig.set_bias_parameter_name(
-            memoryConfig.boot_bias_parameter_name());
-        biasConfig.set_active_type(memoryConfig.boot_bias_active_type());
-      } else if (memoryConfig.has_boot_with_const_id()) {
-        biasConfig.set_bos_id(memoryConfig.boot_with_const_id());
-      }
-      memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig));
-      memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer;
-    }
-
-    if (subModelConfig->has_generator()) {
-      memoryFrameLines_[i].scatterAgents.resize(2);
-      for (auto& agent : memoryFrameLines_[i].scatterAgents) {
-        agent.reset(new ScatterAgentLayer(*agentConfig));
-        agent->init(LayerMap(), parameterMap_);
-      }
-    }
-  }
-
-  if (subModelConfig->has_generator()) {
-    generator_.config = subModelConfig->generator();
-    eosFrameLine_.reset(new EosFrameLine);
-    maxSequenceLength_ = generator_.config.max_num_frames();
-  }
-
-  // get parameters actually used by this Layer Group
-  resizeOrCreateFrames(1);
-  for (auto& para : frames_[0]->getParameters()) {
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-  for (auto& para : parameters_) {  // bias layer parameters
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-}
-
-void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
-  if ((size_t)numFrames <= frames_.size()) {
-    return;
-  }
-
-  frames_.reserve(numFrames);
-  for (auto& inFrameLine : inFrameLines_) {
-    inFrameLine.agents.reserve(numFrames);
-  }
-  for (auto& outFrameLine : outFrameLines_) {
-    outFrameLine.frames.reserve(numFrames);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.frames.reserve(numFrames);
-    memoryFrameLine.agents.reserve(numFrames);
-  }
-  if (eosFrameLine_) {
-    eosFrameLine_->layers.reserve(numFrames);
-  }
-
-  ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           this->parameters_[paramId]->getBuf(PARAMETER_VALUE),
-                           this->parameters_[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT),
-        this->parameters_[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-
-  for (int i = frames_.size(); i < numFrames; ++i) {
-    std::unique_ptr<NeuralNetwork> frame(
-        NeuralNetwork::newNeuralNetwork(subModelName_));
-    frame->init(config_, subParamInitCb);
-
-    for (auto& inFrameLine : inFrameLines_) {
-      inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName));
-    }
-
-    for (auto& outFrameLine : outFrameLines_) {
-      outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName));
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      memoryFrameLine.frames.push_back(
-          frame->getLayer(memoryFrameLine.layerName));
-      memoryFrameLine.agents.push_back(
-          frame->getLayer(memoryFrameLine.linkName));
-    }
-    if (eosFrameLine_) {
-      eosFrameLine_->layers.push_back(
-          frame->getLayer(generator_.config.eos_layer_name()));
-    }
-
-    frames_.emplace_back(std::move(frame));
-  }
-}
-
-void RecurrentGradientMachine::resizeBootFrame(int numSequences) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.biasLayer) {
-      auto biasLayer =
-          dynamic_cast<BootBiasLayer*>(memoryFrameLine.biasLayer.get());
-      CHECK_NOTNULL(biasLayer);
-      biasLayer->resetHeight(numSequences);
-    } else {  // check input root layer height
-      CHECK_EQ(numSequences,
-               memoryFrameLine.rootLayer->getOutput().getNumSequences());
-    }
-  }
-}
-
-void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::checkInputConsistency(
-    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
-  if (commonSeqInfo_.empty()) {
-    commonSeqInfo_.resize(seqInfo.size());
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
-      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
-    }
-  } else {
-    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
-        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-        << " has mismatched number of sequences";
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-    }
-  }
-}
-
-void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
-  int numSequences = commonSeqInfo_.size();
-  numSeqs_.resize(maxSequenceLength_);
-  for (int i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
-      numSeqs_[j] = i + 1;
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeInput(PassType passType) {
-  info_.clear();
-  info_.resize(inFrameLines_.size());
-
-  commonSeqInfo_.clear();
-  seqInfos_.clear();
-  seqInfos_.resize(inFrameLines_.size());
-
-  for (size_t i = 0; i < inFrameLines_.size(); i++) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      continue;
-    }
-    input.getSeqInfo(&seqInfos_[i]);
-    checkInputConsistency(i, seqInfos_[i]);
-  }
-  CHECK(!commonSeqInfo_.empty())
-      << "At least one input needs to be sequence or subsequence";
-  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
-
-  calcNumSequencesAtEachStep();
-
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      seqInfos_[i] = commonSeqInfo_;
-    }
-    createInFrameInfo(i, input, passType);
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    // inFrameLine select rows in real layer one time
-    for (size_t i = 0; i < inFrameLines_.size(); i++) {
-      selectRowsOneTime(inFrameLines_[i].inLayer,
-                        info_[i].allIds,
-                        &(inFrameLines_[i].outArg),
-                        passType);
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
-  calcSequenceStartPositions();
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    Info info;
-    auto& outFrameLine = outFrameLines_[i];
-    ICpuGpuVectorPtr sequenceStartPositions;
-    ICpuGpuVectorPtr subSequenceStartPositions;
-    createOutFrameInfo(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
-                                       subSequenceStartPositions,
-                                       info.allIds,
-                                       info.idIndex);
-  }
-}
-
-void RecurrentGradientMachine::connectFrames(PassType passType) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      createMemoryFrameInfo(&memoryFrameLine, passType);
-      scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer,
-                                          memoryFrameLine.outArg,
-                                          memoryFrameLine.allIds,
-                                          /* idIndex */ 0,
-                                          memoryFrameLine.allIds->getSize(),
-                                          /* handleBackward */ true);
-      if (memoryFrameLine.sequenceStartPositions) {
-        int size = memoryFrameLine.sequenceStartPositions->getSize();
-        scatterAgent->setSequenceStartPositions(
-            memoryFrameLine.sequenceStartPositions,
-            /* seqStartPosIndex */ 0,
-            size);
-      }
-    }
-  }
-
-  for (auto& outFrameLine : outFrameLines_) {
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    gatherAgent->clearRealLayers();
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    // connect in_links
-    for (size_t j = 0; j < inFrameLines_.size(); ++j) {
-      Info& info = info_[j];
-      // idSize denotes the sum number of tokens in each length i
-      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
-      int idSize = info.idIndex.empty() ? numSeqs_[i]
-                                        : info.idIndex[i + 1] - info.idIndex[i];
-      InFrameLine inFrameLine = inFrameLines_[j];
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
-      scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
-                                          inFrameLine.outArg,
-                                          info.allIds,
-                                          idIndex,
-                                          idSize,
-                                          i == 0);
-      if (info.sequenceStartPositions) {
-        // size: the length of subsequence
-        int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
-        scatterAgent->setSequenceStartPositions(
-            info.sequenceStartPositions, info.seqStartPosIndex[i], size);
-      }
-    }
-
-    // connect out_links
-    for (auto& outFrameLine : outFrameLines_) {
-      auto gatherAgent =
-          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-      gatherAgent->addRealLayer(outFrameLine.frames[i]);
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      NeuralNetwork::connect(
-          memoryFrameLine.agents[i],
-          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          numSeqs_[i] /*height of agent*/);
-    }
-  }
-}
-
-void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                       std::vector<Argument>* outArgs,
-                                       PassType passType) {
-  /* inArgs and outArgs are not used.
-     The inputs are inFrameLines_[i].inLayer.
-     The outputs are outFramesLines_[i].agentLayer
-   */
-
-  if (generating_) {
-    generateSequence();
-    return;
-  }  // else forward..
-
-  reorganizeInput(passType);
-  int numSequences = commonSeqInfo_.size();
-
-  resizeOrCreateFrames(maxSequenceLength_);
-  resizeBootFrame(numSequences);
-
-  connectFrames(passType);
-
-  REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
-  // forward
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(passType);
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[i]->forward(inArgs, &outArgs, passType);
-  }
-
-  reorganizeOutput(passType);
-}
-
-void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
-  if (generating_) {
-    return;
-  }
-  REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
-  AsyncGpuBlock asyncGpuBlock;
-  for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
-    frames_[i]->backward(nullptr);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->backward(nullptr);
-  }
-}
-
-void RecurrentGradientMachine::forwardBackward(
-    const std::vector<Argument>& inArgs,
-    std::vector<Argument>* outArgs,
-    PassType passType,
-    const UpdateCallback& callback) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
-  // call printers frame by frame
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
-    evaluator->eval(*(frames_[i].get()));
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchControlCallbacks(
-    const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-    const NormOrDropNodeCallback& normOrDropNode,
-    const DropCallback& stopBeamSearch) {
-  this->removeBeamSearchControlCallbacks();
-  //! for gcc 46, aggregate initialization is not supported. TAT
-  this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks(
-      adjustBeamSearch, normOrDropNode, stopBeamSearch);
-}
-
-void RecurrentGradientMachine::removeBeamSearchControlCallbacks() {
-  if (this->beamSearchCtrlCallbacks_) {
-    delete this->beamSearchCtrlCallbacks_;
-    this->beamSearchCtrlCallbacks_ = nullptr;
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks(
-    const EachStepCallback& onEachStepStarted,
-    const EachStepCallback& onEachStepStoped) {
-  this->removeBeamSearchStatisticsCallbacks();
-  this->beamSearchStatistics_ =
-      new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped);
-}
-
-void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
-  if (this->beamSearchStatistics_) {
-    delete this->beamSearchStatistics_;
-    this->beamSearchStatistics_ = nullptr;
-  }
-}
-
-namespace {
-void lenToStarts(std::vector<int>& starts) {
-  int pos = 0;
-  starts.back() = 0;
-  for (auto& start : starts) {
-    int tmp = start;
-    start = pos;
-    pos += tmp;
-  }
-  starts.back() = pos;
-}
-}  // namespace
-
-void RecurrentGradientMachine::calcSequenceStartPositions() {
-  std::vector<int> starts(commonSeqInfo_.size() + 1);
-  for (auto& seqInfo : commonSeqInfo_) {
-    starts[seqInfo.seqId] = seqInfo.topLevelLength;
-  }
-  lenToStarts(starts);
-  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
-  std::copy(starts.begin(),
-            starts.end(),
-            sequenceStartPositions_->getMutableData(false));
-}
-
-void RecurrentGradientMachine::checkOutputConsistency(
-    OutFrameLine& outFrameLine) {
-  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
-    int numSequences = frame->getOutput().getNumSequences();
-    CHECK_EQ(numSeqs_[i], numSequences);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  checkOutputConsistency(outFrameLine);
-
-  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
-    createOutFrameInfo_seq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  } else {
-    createOutFrameInfo_subseq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_seq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqStart = starts[commonSeqInfo_[j].seqId];
-      int seqLength = commonSeqInfo_[j].topLevelLength;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-  sequenceStartPositions = sequenceStartPositions_;
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_subseq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  size_t numSequences = commonSeqInfo_.size();
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-  std::vector<int> subStarts(starts[numSequences] + 1);
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    const int* seqStarts =
-        frame->getOutput().sequenceStartPositions->getData(false);
-    for (size_t j = 0; j < numSequences; ++j) {
-      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
-          seqStarts[j + 1] - seqStarts[j];
-    }
-  }
-  lenToStarts(subStarts);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int pos = starts[commonSeqInfo_[j].seqId] + i;
-      int subSeqStart = subStarts[pos];
-      int subSeqEnd = subStarts[pos + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-
-  ICpuGpuVector::resizeOrCreate(
-      subSequenceStartPositions, subStarts.size(), false);
-  int* cpuSubSequenceStartPositions =
-      subSequenceStartPositions->getMutableData(false);
-  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* cpuSequenceStartPositions =
-      sequenceStartPositions->getMutableData(false);
-  for (size_t i = 0; i <= numSequences; ++i) {
-    cpuSequenceStartPositions[i] = subStarts[starts[i]];
-  }
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* create scattered id infomation for all realLayer of inFrameLines one time.
- * If hasSubseq, will also create scattered sequenceStartPositions infomation
- * for all realLayer of inFrameLines one time.
- */
-void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
-                                                 const Argument& input,
-                                                 PassType passType) {
-  if (!input.hasSeq()) {
-    createInFrameInfo_nonseq(inlinkId, input, passType);
-  } else if (!input.hasSubseq()) {
-    createInFrameInfo_seq(inlinkId, input, passType);
-  } else {
-    createInFrameInfo_subseq(inlinkId, input, passType);
-  }
-}
-
-void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.clear();
-  for (size_t i = 0; i < seqInfo.size(); ++i) {
-    allIds.push_back(seqInfo[i].seqId);
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-}
-
-void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
-                                                     const Argument& input,
-                                                     PassType passType) {
-  std::vector<int> allIds;
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      int seqStart = seqInfo[j].seqStart;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-  }
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-  std::vector<int> sequenceStartPositions;
-  const int* subSequenceStartPositions = nullptr;
-
-  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-  inlinkInfo->seqStartPosIndex.clear();
-  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    sequenceStartPositions.push_back(0);  // first element = 0
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       subSeqEnd - subSeqStart);
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-  }
-  // inFrameLine create sequenceStartPositions one time
-  CHECK_EQ(
-      sequenceStartPositions.size(),
-      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/
-void RecurrentGradientMachine::createMemoryFrameInfo(
-    MemoryFrameLine* memoryFrameLine, PassType passType) {
-  const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
-  size_t numSequences = input.getNumSequences();
-  std::vector<int> allIds;
-  bool seqFlag = input.hasSeq();
-  CHECK(!input.hasSubseq())
-      << "Subsequence boot layer for memory is not supported";
-
-  if (seqFlag) {  // for sequenceScatterAgentLayer
-    std::vector<int> sequenceStartPositions;
-    sequenceStartPositions.push_back(0);  // first element = 0
-    const int* starts = input.sequenceStartPositions->getData(false);
-    for (size_t i = 0; i < numSequences; ++i) {
-      // memory info adopt info of inlinks[0]
-      int seqId = seqInfos_[0][i].seqId;
-      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       starts[seqId + 1] - starts[seqId]);
-    }
-    createSeqPos(sequenceStartPositions,
-                 &(*memoryFrameLine).sequenceStartPositions);
-
-  } else {  // for scatterAgentLayer
-    for (size_t i = 0; i < numSequences; ++i) {
-      allIds.push_back(seqInfos_[0][i].seqId);
-    }
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
-  // memoryFrameLine select rows in real layer one time
-  selectRowsOneTime((*memoryFrameLine).rootLayer,
-                    (*memoryFrameLine).allIds,
-                    &(*memoryFrameLine).outArg,
-                    passType);
-}
-
-void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds,
-                                             IVectorPtr* dstIds,
-                                             int size) {
-  int idSize = srcIds.size();
-  CHECK_EQ(idSize, size);
-  IVector::resizeOrCreate(*dstIds, idSize, useGpu_);
-  (*dstIds)->copyFrom(srcIds.data(), idSize);
-  // check
-  std::sort(srcIds.begin(), srcIds.end());
-  for (int i = 0; i < idSize; ++i) {
-    CHECK_EQ(srcIds[i], i);
-  }
-}
-
-void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
-                                                 const IVectorPtr& allIds,
-                                                 Argument* arg,
-                                                 PassType passType) {
-  Argument& src = layer->getOutput();
-  if (src.value) {
-    const MatrixPtr& realV = src.value;
-    int height = realV->getHeight();
-    int width = realV->getWidth();
-    Matrix::resizeOrCreate(
-        arg->value, height, width, /* trans */ false, useGpu_);
-    arg->value->zeroMem();
-    arg->value->selectRows(*realV, *allIds);
-    if (passType != PASS_TEST) {
-      Matrix::resizeOrCreate(
-          arg->grad, height, width, /* trans */ false, useGpu_);
-      arg->grad->zeroMem();
-    }
-  }
-  if (src.ids) {
-    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
-    arg->ids->selectFrom(*src.ids, *allIds);
-  }
-}
-
-void RecurrentGradientMachine::createSeqPos(
-    const std::vector<int>& sequenceStartPosition,
-    ICpuGpuVectorPtr* sequenceStartPositions) {
-  int size = sequenceStartPosition.size();
-  const int* data = sequenceStartPosition.data();
-  ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false);
-  (*sequenceStartPositions)->copyFrom(data, size, false);
-}
-
-size_t RecurrentGradientMachine::getGenBatchSize() {
-  size_t numSequences = 0;
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (!memoryFrameLine.rootLayer) continue;
-    Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
-    size_t batchSize = bootArg.getNumSequences();
-    if (numSequences) {
-      CHECK_EQ(numSequences, batchSize);
-    } else {
-      numSequences = batchSize;
-    }
-  }
-  CHECK(numSequences)
-      << "Fail to get batch size in generation. "
-         "At least one of the Memory layer MUST have a layer that is NOT in "
-         "the layer group to boot it, and this boot layer is used to "
-         "decide batch_size in generation process.";
-  return numSequences;
-}
-
-void RecurrentGradientMachine::generateSequence() {
-  CHECK_NOTNULL(eosFrameLine_.get());
-  CHECK_GE(outFrameLines_.size(), 1UL);
-  size_t numSequences = getGenBatchSize();
-
-  resizeBootFrame(numSequences);
-  // We create only two sub-network in generation, one stores states of all
-  // layers in previous time step and the other storing the states at current
-  // time step.
-  resizeOrCreateFrames(2);
-
-  // outFrameLines_.size() > 1UL
-  dataArgsSize_ = outFrameLines_.size() - 1;
-  dataArgs_.resize(dataArgsSize_);
-  dataArgsFrame_.clear();
-  dataArgsFrame_.resize(dataArgsSize_);
-
-  // connect boot frame memory links
-  std::vector<int> ids(numSequences);
-  for (size_t i = 0; i < numSequences; ++i) {
-    ids[i] = i;
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
-    }
-    NeuralNetwork::connect(
-        memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
-  }
-
-  // boot layer forward
-  AsyncGpuBlock asyncGpuBlock;
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(PASS_TEST);
-  }
-
-  // init outArg
-  size_t resultNum = generator_.config.num_results_per_sample();
-  size_t maxGenWordCount =
-      generator_.config.max_num_frames() * numSequences * resultNum;
-  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
-  if (resultNum > 1) {
-    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
-    Matrix::resizeOrCreate(generator_.outArg.in,
-                           /* height */ numSequences,
-                           /* width */ resultNum,
-                           false,
-                           /* useGpu */ false);
-  }
-  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
-                                numSequences + 1,
-                                /* useGpu */ false);
-  if (getBeamSize() > 1) {
-    beamSearch(numSequences);
-  } else {
-    oneWaySearch(numSequences);
-  }
-  if (dataArgsSize_) createDataOutlink();
-
-  size_t size = generator_.ids.size();
-  generator_.outArg.ids->resize(size);
-  generator_.outArg.ids->copyFrom(generator_.ids.data(), size);
-
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-  auto dataAgent = dynamic_cast<DataLayer*>(outFrameLine.agentLayer.get());
-  CHECK_NOTNULL(dataAgent);
-  dataAgent->setData(generator_.outArg);
-  dataAgent->prefetch();
-}
-
-void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-
-  // finalPaths_[0] stores the generated results of the
-  // entire batch, so its size exactly equals to batchSize.
-  finalPaths_.clear();
-  finalPaths_.resize(1);
-  std::vector<Path>& finalPaths = finalPaths_[0];
-  finalPaths.resize(batchSize);
-
-  seqIds_.resize(batchSize);
-  std::vector<int> scatterIds;
-  for (size_t i = 0; i < batchSize; ++i) {
-    finalPaths[i].seqId = i;
-    seqIds_[i] = i;
-  }
-
-  // forward
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (i && scatterIds.empty()) break;
-    int machineCur = i % 2;
-    int machinePrev = (i - 1) % 2;
-    // connect memory links
-    if (i) {
-      seqIds_.clear();
-      for (size_t j = 0; j < batchSize; ++j) {
-        if (finalPaths[j].seqId != -1) seqIds_.push_back(j);
-      }
-
-      for (auto& memoryFrameLine : memoryFrameLines_) {
-        auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-            memoryFrameLine.scatterAgents[machineCur].get());
-        scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds);
-        scatterAgent->forward(PASS_TEST);
-        NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                               memoryFrameLine.scatterAgents[machineCur]);
-      }
-    }
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-    const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids;
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j));
-      finalPaths[seqIds_[j]].machineIdVec.push_back(j);
-    }
-
-    copyDataOutlinkFrame(machineCur);
-
-    // check eos
-    const IVectorPtr& eosVec =
-        eosFrameLine_->layers[machineCur]->getOutput().ids;
-    scatterIds.clear();
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      if (eosVec->getElement(j) == 1U) {
-        // path.seqId = -1 indicates end of generation
-        // of an input sequence
-        finalPaths[seqIds_[j]].seqId = -1;
-      } else {
-        scatterIds.push_back(j);
-      }
-    }
-  }
-
-  batchMachineIdVec_.clear();
-  batchMachineStartPos_.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  generator_.ids.clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    generator_.ids.insert(generator_.ids.end(),
-                          finalPaths[i].ids.begin(),
-                          finalPaths[i].ids.end());
-    starts[i + 1] = generator_.ids.size();
-    batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                              finalPaths[i].machineIdVec.begin(),
-                              finalPaths[i].machineIdVec.end());
-  }
-}
-
-void RecurrentGradientMachine::connectPrevFrame(int stepId,
-                                                std::vector<Path>& paths) {
-  int machineCur = stepId % 2;
-  int machinePrev = (stepId - 1) % 2;
-  int beam = getBeamSize();
-  machineIds_.clear();
-  topIds_.clear();
-  seqIds_.clear();
-
-  for (size_t j = 0; j < paths.size(); ++j) {
-    machineIds_.push_back(paths[j].machineId);
-    topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex);
-    seqIds_.push_back(paths[j].seqId);
-  }
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName);
-    auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-        memoryFrameLine.scatterAgents[machineCur].get());
-    scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_);
-    scatterAgent->forward(PASS_TEST);
-    NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                           memoryFrameLine.scatterAgents[machineCur]);
-  }
-}
-
-void RecurrentGradientMachine::forwardFrame(int machineCur) {
-  // forward
-  const std::vector<Argument> inArgs;
-  std::vector<Argument> outArgs;
-  frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-  copyDataOutlinkFrame(machineCur);
-
-  IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids;
-  MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in;
-  IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */);
-    cpuId_->copyFrom(*ids);
-    Matrix::resizeOrCreate(cpuProb_,
-                           in->getHeight(),
-                           in->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    cpuProb_->copyFrom(*in);
-    IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */);
-    cpuEos_->copyFrom(*eos);
-  } else {
-    cpuId_ = ids;
-    cpuProb_ = in;
-    cpuEos_ = eos;
-  }
-}
-
-void RecurrentGradientMachine::singlePathExpand(Path& curPath,
-                                                size_t curPathId,
-                                                std::vector<Path>& newPaths,
-                                                size_t expandWidth) {
-  int calc_id =
-      gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0;
-
-  const int* idVec = cpuId_->getData();
-  const real* probMat = cpuProb_->getData();
-  const int* eosVec = cpuEos_->getData();
-
-  for (size_t k = 0; k < expandWidth; k++) {
-    int index = curPathId * expandWidth + k;
-    int id = idVec[index];
-    real prob = probMat[index];
-    /*
-     * Ordinarily, beam search greedily expands the most promising expandWidth
-     * paths that currently are ALWAYS returned by MaxIdLayer.
-     * In one condition, if user customizes the beam search procedure by
-     * restricting the expansion within a user defined subset,
-     * as a result, MaxIdLayer possibly COULD NOT return expandWidth
-     * vaild expansions, and it will use -1 to indicate the end of valid
-     * expansion candidates.
-     */
-    if (id == -1) break;
-
-    real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob;
-    Path newPath(
-        curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/);
-    if (this->beamSearchCtrlCallbacks_) {
-      if (beamSearchCtrlCallbacks_->stopDetermineCandidates(
-              newPath.seqId, newPath.ids, newPath.probHistory))
-        return;
-    }
-    // outFrameLines_.size() > 1UL
-    if (dataArgsSize_) {
-      newPath.machineIdVec = curPath.machineIdVec;
-      newPath.machineIdVec.push_back(curPathId);
-    }
-    bool atEos =
-        eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_;
-    // adjustNewPath
-    newPath.adjustProb(calc_id, atEos);
-    if (this->beamSearchCtrlCallbacks_) {
-      this->beamSearchCtrlCallbacks_->normOrDropNode(
-          newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb);
-    }
-    if (!newPath.isDropable()) {
-      atEos ? finalPaths_[curPath.seqId].push_back(newPath)
-            : newPaths.push_back(newPath);
-    }
-  }  // for expandWidth
-
-  if (gDiyProbStop) {
-    gDiyProbStop(calc_id);
-  }
-}
-
-void RecurrentGradientMachine::beamExpand(std::vector<Path>& paths,
-                                          std::vector<Path>& newPaths) {
-  size_t candidatePathCount = paths.size();
-  // idVec.size() could be larger than candidatePathCount * beam,
-  // so user can drop some node customly.
-  CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL);
-  size_t expandWidth = cpuId_->getSize() / candidatePathCount;
-
-  // iterate over each sequence
-  size_t totalExpandCount = 0;
-  int prevSeqId = -1;
-  int curSeqId = 0;
-  for (size_t j = 0; j <= candidatePathCount; j++) {
-    // expansions of a single sequence are all processed
-    curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1);
-    if (prevSeqId != -1 && curSeqId != prevSeqId) {
-      totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount);
-    }
-    if (j == candidatePathCount) return;
-    singlePathExpand(paths[j], j, newPaths, expandWidth);
-
-    prevSeqId = paths[j].seqId;
-  }  // for paths
-}
-
-// Drop extra nodes to beam size.
-size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
-                                            size_t seqId,
-                                            size_t totalExpandCount) {
-  size_t minNewPathSize =
-      std::min(getBeamSize(), newPaths.size() - totalExpandCount);
-  if (!minNewPathSize) {
-    return 0;
-  }
-  std::nth_element(newPaths.begin() + totalExpandCount,
-                   newPaths.begin() + totalExpandCount + minNewPathSize,
-                   newPaths.end(),
-                   Path::greaterPath);
-  newPaths.resize(totalExpandCount + minNewPathSize);
-
-  real minPathLogProb =
-      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-  real maxPathLogProb =
-      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-
-  // Remove the already formed paths that are relatively short
-  finalPaths_[seqId].erase(
-      std::remove_if(finalPaths_[seqId].begin(),
-                     finalPaths_[seqId].end(),
-                     [&](Path& p) { return p.logProb < minPathLogProb; }),
-      finalPaths_[seqId].end());
-  for (auto p : finalPaths_[seqId]) {
-    if (minFinalPathLogProb_[seqId] > p.logProb) {
-      minFinalPathLogProb_[seqId] = p.logProb;
-    }
-  }
-
-  if (finalPaths_[seqId].size() >= getBeamSize() &&
-      minFinalPathLogProb_[seqId] >= maxPathLogProb) {
-    newPaths.resize(totalExpandCount);
-    return 0;
-  }
-  return minNewPathSize;
-}
-
-void RecurrentGradientMachine::fillGenOutputs() {
-  size_t numResults = generator_.config.num_results_per_sample();
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
-    std::partial_sort(finalPaths_[i].begin(),
-                      finalPaths_[i].begin() + minFinalPathsSize,
-                      finalPaths_[i].end(),
-                      Path::greaterPath);
-    finalPaths_[i].resize(minFinalPathsSize);
-  }
-
-  generator_.ids.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  if (numResults > 1) {
-    int idsProbSaveSize = 0;
-    for (auto inSeq : finalPaths_) {
-      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
-      idsProbSaveSize += inSeq.size();
-    }
-    Matrix::resizeOrCreate(
-        generator_.outArg.value, idsProbSaveSize, 1, false, false);
-    real* idsProb = generator_.outArg.value->getData();
-
-    real* probs = generator_.outArg.in->getData();
-    size_t curPos = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        Path& path = finalPaths_[i][j];
-        size_t genLen = path.ids.size();
-        generator_.ids.push_back(genLen);  // sequence size
-        generator_.ids.insert(
-            generator_.ids.end(), path.ids.begin(), path.ids.end());
-        generator_.ids.push_back(-1);  // end of sequence
-
-        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
-        curPos += genLen;
-        idsProb[curPos++] = -1.0;
-        probs[i * numResults + j] = path.logProb;
-      }
-      starts[i + 1] = generator_.ids.size();
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      CHECK(!finalPaths_[i].empty());
-      Path& path = finalPaths_[i][0];
-      generator_.ids.insert(
-          generator_.ids.end(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + path.ids.size();
-    }
-  }
-}
-
-void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    Argument outFrame;
-    outFrame.resizeAndCopyFrom(
-        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
-    dataArgsFrame_[i].emplace_back(outFrame);
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    bool isSeq, std::vector<Argument>& outArgs) {
-  batchMachineIdVec_.clear();
-
-  size_t seqIdx = 0;
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-      if (isSeq) {
-        for (size_t i = 0; i < machineIdVec.size(); ++i) {
-          size_t rowId = machineIdVec[i];
-          int* seqPos =
-              outArgs[i].sequenceStartPositions->getMutableData(false);
-          batchMachineIdVec_.push_back(seqPos[rowId]);
-        }
-      } else {
-        batchMachineIdVec_.insert(
-            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
-      }
-      seqIdx++;
-    }
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
-    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
-  size_t totalSeqNum = std::accumulate(
-      finalPaths_.begin(),
-      finalPaths_.end(),
-      0UL,
-      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
-  copySize.resize(totalSeqNum, 1);
-
-  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
-  if (isSeq) {
-    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
-    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
-             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
-    int* starts = inputSeqStartPos->getMutableData(false);
-    int seqId = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
-                                            : starts[j + 1] - starts[j];
-        batchMachineStartPos_[seqId + 1] =
-            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
-        seqId++;
-      }
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
-      batchMachineStartPos_[i + 1] =
-          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlink() {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    bool isSeq = dataArgsFrame_[i][0].hasSeq();
-    std::vector<int> copySize;
-    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
-    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
-
-    dataArgs_[i].concat(dataArgsFrame_[i],
-                        batchMachineIdVec_,
-                        batchMachineStartPos_,
-                        copySize,
-                        useGpu_,
-                        HPPL_STREAM_1,
-                        PASS_TEST);
-    auto dataAgent =
-        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
-    CHECK_NOTNULL(dataAgent);
-    dataAgent->setData(dataArgs_[i]);
-  }
-}
-
-void RecurrentGradientMachine::beamSearch(size_t batchSize) {
-  finalPaths_.clear();
-  finalPaths_.resize(batchSize);
-  seqIds_.resize(batchSize);
-  minFinalPathLogProb_.clear();
-  minFinalPathLogProb_.resize(batchSize, 0);
-
-  std::vector<Path> paths;
-  std::vector<Path> newPaths;
-  for (size_t i = 0; i < batchSize; ++i) {
-    paths.push_back(Path(i));
-    if (this->beamSearchCtrlCallbacks_) {
-      paths.back().recordHistory();
-    }
-  }
-
-  // restart beam search
-  stopBeamSearch_ = false;
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    int machineCur = i % 2;
-    std::unique_ptr<
-        ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&, int>>
-        statisticsBlock;
-    if (this->beamSearchStatistics_) {
-      auto ptr =
-          new ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&,
-                              int>(beamSearchStatistics_->onEachStepStarted,
-                                   beamSearchStatistics_->onEachStepStoped,
-                                   i);
-      statisticsBlock.reset(ptr);
-    }
-    if (stopBeamSearch_) break;
-
-    if (i) connectPrevFrame(i, paths);
-
-    if (this->beamSearchCtrlCallbacks_) {
-      std::vector<std::vector<int>*> prefixes;
-      prefixes.resize(paths.size());
-      std::transform(
-          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
-            return const_cast<std::vector<int>*>(&p.ids);
-          });
-      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
-          prefixes, frames_[machineCur].get(), i);
-    }
-
-    forwardFrame(machineCur);
-    beamExpand(paths, newPaths);
-    if (newPaths.empty()) break;
-
-    paths = newPaths;
-    newPaths.clear();
-  }  // end for machineCur
-  fillGenOutputs();
-}
-
-void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) {
-  if (gDiyProbMethod) {
-    logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
deleted file mode 100644
index 0a13d4f6f..000000000
--- a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * Private data class declares.
- * Used for user customized beam search.
- */
-class BeamSearchControlCallbacks;
-class BeamSearchStatisticsCallbacks;
-
-class RecurrentGradientMachine : public NeuralNetwork {
- public:
-  RecurrentGradientMachine(const std::string& subModelName,
-                           NeuralNetwork* rootNetwork);
-
-  // Disable copy and assign.
-  RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete;
-  RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) =
-      delete;
-
-  virtual ~RecurrentGradientMachine() {
-    this->removeBeamSearchStatisticsCallbacks();
-    this->removeBeamSearchControlCallbacks();
-  }
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void resetState() {}
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<int>& getParameterIds() { return parameterIds_; }
-
-  /**
-   * @brief BeamSearchCandidatesAdjustCallback
-   *
-   * Adjust searching candidates to restrict beam search
-   * searching within a limited subset of all possibile paths.
-   *
-   * The first parameter is the prefixes of all formed paths in current
-   * beam search step, whose type is basically int[][].
-   *
-   * The second parameter is a pointer to the network used to generate sequence,
-   * user can use this pointer to tranverse each layer in the network to
-   * modify behaivors of a particular layer.
-   *
-   * The third parameter is an integer to indicate the iteration number of
-   * beam search, so that user can customize different operations in different
-   * beam search iterations.
-   */
-  typedef std::function<void(
-      const std::vector<std::vector<int>*>&, NeuralNetwork*, const int)>
-      BeamSearchCandidatesAdjustCallback;
-
-  /**
-   * @brief DropCallback
-   *
-   * Drop a whole prefix or one candidate in beam search or not.
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is one path in beam search,
-   * which is made up of node indices.
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * Return true if this prefix or candidate is expected to be dropped.
-   */
-  typedef std::function<bool(
-      int seqId, const std::vector<int>&, const std::vector<real>&)>
-      DropCallback;
-
-  /**
-   * @brief NormOrDropNodeCallback
-   *
-   * Normalize a path's probabilities or just drop it by modifying path.logProb
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is path.ids
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * The fourth parameter is the probability of the whole path.
-   */
-  typedef std::function<void(
-      int seqId, const std::vector<int>&, std::vector<real>&, real*)>
-      NormOrDropNodeCallback;
-
-  /**
-   * @brief Register beam search control callbacks. Used for prediction.
-   *
-   * @param queryBeamSearch: Give the sequences already formed, return the
-   * nodes expected to be expanded.
-   * Input: A pointer to an array holding pathes which have been expanded
-   * Return: A pointer to an array holding nodes wanted to be expanded.
-   *
-   * @param dropOneNode: Early drop a node in one beam search step.
-   * Given the path formed and probability history, decide whether a node
-   * should be dropped or not.
-   *
-   * @param stopBeamSearch: Early stop a path in one beam search step.
-   * Given the path and probability history, decide whether a path
-   * should be dropped or not.
-   */
-  void registerBeamSearchControlCallbacks(
-      const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-      const NormOrDropNodeCallback& normOrDropNode,
-      const DropCallback& stopBeamSearch);
-
-  /**
-   * @brief Remove user costumized beam search callbacks,
-   *
-   * make sequence generation acts like normal beam search.
-   */
-  void removeBeamSearchControlCallbacks();
-
-  /**
-   * @brief EachStepCallback
-   *
-   * Invoke with beam search step.
-   */
-  typedef std::function<void(int)> EachStepCallback;
-
-  /**
-   * @brief register statistics methods for performance profile of beam search.
-   *
-   * @param onEachStepStarted: invoke once a beam search step starts.
-   * Its input is index of the beam search step.
-   *
-   * @param onEachStepStoped: invoke once a beam search step ends.
-   * Its input is index of the beam search step.
-   */
-  void registerBeamSearchStatisticsCallbacks(
-      const EachStepCallback& onEachStepStarted,
-      const EachStepCallback& onEachStepStoped);
-
-  /**
-   * @brief Remove beam search callbacks.
-   */
-  void removeBeamSearchStatisticsCallbacks();
-
-  /**
-   * @brief Stop beam search for current source.
-   *
-   * Will restart beam search in the next forward
-   */
-  void stopBeamSearch();
-
-  struct Path {
-    /**
-     * @brief ids, path of beam search.
-     */
-    std::vector<int> ids;
-
-    /**
-     * @brief idsProb, log probability of each generated word.
-     */
-    std::vector<real> idsProb;
-
-    /**
-     * @brief logProb, current probability of path.
-     */
-    real logProb;
-
-    int machineId;  // index of sample in frame
-    int topIndex;   // index of MaxIdLayer output in one sample
-    int seqId;      // index of sequence in batch generation
-    std::vector<int> machineIdVec;
-
-    /**
-     * @brief A record of each node's probality in a formed path in beam search.
-     *
-     * @note  It could be empty when history is not recorded. If the history is
-     *        wanted to be recorded, recordHistory() MUST be invoked first.
-     */
-    std::vector<real> probHistory;
-
-    /**
-     * @brief Path default ctor, first logProb is 0.
-     */
-    Path() {
-      logProb = 0;
-      seqId = 0;
-    }
-    explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }
-
-    /**
-     * @brief Create a new path based on an old path and
-     * a new node with probability.
-     *
-     * @param old       old path
-     * @param newId     index of the new node
-     * @param logProb   probability of the new node.
-     * @param machineId sample index of a frame in RNN
-     * @param topIndex  index of MaxIdLayer output in one sample
-     */
-    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
-        : ids(old.ids),
-          idsProb(old.idsProb),
-          logProb(old.logProb + logProb),
-          machineId(machineId),
-          topIndex(topIndex),
-          seqId(old.seqId) {
-      ids.push_back(newId);
-      idsProb.push_back(logProb);
-      if (!old.probHistory.empty()) {
-        this->probHistory = old.probHistory;
-        // probHistory store current prob, not sum
-        this->probHistory.push_back(logProb);
-      }
-    }
-
-    /**
-     * @brief operator <
-     *
-     * Path a < Path b means log probability of a is smaller than that of b
-     */
-    bool operator<(const Path& other) const {
-      return (logProb < other.logProb);
-    }
-
-    static bool greaterPath(const Path& a, const Path& b) { return (b < a); }
-
-    /**
-     * @brief Start recording history in this path.
-     */
-    void recordHistory() { this->probHistory.push_back(this->logProb); }
-
-    /**
-     * @brief Adjust probability for DIY beam search interface.
-     * In normal situation, it will do nothing.
-     *
-     * @param calc_id: the object id for DIY beam search interface.
-     * @param atEos: at end of sequence or not.
-     */
-    void adjustProb(int calc_id, bool atEos = false);
-
-    /**
-     * @brief isDropable indacating whether the current node will be
-     * dropped or not in beam search.
-     *
-     * @note: if logProb is -inf, current node will be dropped.
-     * @return true to drop the current node.
-     */
-    bool isDropable() const { return std::isinf(logProb) && logProb < 0; }
-  };
-
-  /**
-   * @brief access beam search results.
-   * @return beam search results.
-   */
-  const std::vector<std::vector<Path>>& getFinalPaths() const {
-    return this->finalPaths_;
-  }
-
- protected:
-  std::vector<Argument::SeqInfo> commonSeqInfo_;
-  ICpuGpuVectorPtr sequenceStartPositions_;
-  void calcSequenceStartPositions();
-  void checkInputConsistency(int inlinkId,
-                             const std::vector<Argument::SeqInfo>& seqInfo);
-  void reorganizeInput(PassType passType);
-  void reorganizeOutput(PassType passType);
-  void connectFrames(PassType passType);
-  void calcNumSequencesAtEachStep();
-
-  void resizeOrCreateFrames(int numFrames);
-  void resizeBootFrame(int numSequences);
-
-  void generateSequence();
-  void oneWaySearch(size_t batchSize);
-  void beamSearch(size_t batchSize);
-
-  struct InFrameLine {
-    std::string linkName;
-    LayerPtr inLayer;
-    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    Argument outArg;               // scatter output argument
-  };
-  std::vector<InFrameLine> inFrameLines_;
-
-  struct OutFrameLine {
-    std::string layerName;
-    LayerPtr agentLayer;
-    std::vector<LayerPtr> frames;
-  };
-  std::vector<OutFrameLine> outFrameLines_;
-
-  struct MemoryFrameLine {
-    std::string layerName;
-    std::string linkName;
-    LayerPtr bootLayer;  // actually used biasLayer or rootAgent
-    LayerPtr biasLayer;
-    LayerPtr rootLayer;  // layer in root network to boot this memory
-    LayerPtr rootAgent;  // agent to link rootLayer
-    std::vector<LayerPtr> frames;
-    std::vector<LayerPtr> agents;
-    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
-    Argument outArg;                      // scatter output argument
-    // Different memoryFrameLine have different element as follows
-    IVectorPtr allIds;  // scattered id of realLayer
-    ICpuGpuVectorPtr
-        sequenceStartPositions;  // scattered sequenceStartPositions
-  };
-  std::vector<MemoryFrameLine> memoryFrameLines_;
-
-  // Each inFrameLines(inlinks) has its own info(elements) below,
-  // and all outFrameLines(outlinks) share the info with one inFrameLine,
-  // which is assigned by targetInfoInlinkId_.
-  struct Info {
-    // The original positions in the original batch
-    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
-
-    // index of allIds for each step [maxSequenceLength_]
-    // idIndex[i] is the total length of the first i sequences
-    std::vector<int> idIndex;
-
-    ICpuGpuVectorPtr
-        sequenceStartPositions;         // scattered sequenceStartPositions
-    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
-  };
-  std::vector<Info> info_;  // for input
-
-  // numSeqs_[i] is the number sequences which is longer than i (for sequence
-  // data) or has more than i subsequences (for subsequence data)
-  // Equivalently, numSeqs_[i] is the number of sequences at step i;
-  std::vector<int> numSeqs_;
-
-  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
-
-  void checkOutputConsistency(OutFrameLine& outFrameLine);
-
-  /* create scattered id infomation for all realLayer of inFrameLines one time.
-   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
-   *  for all realLayer of inFrameLines one time.
-   */
-  void createInFrameInfo(int inlinks_id,
-                         const Argument& input,
-                         PassType passType);
-  void createInFrameInfo_nonseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-  void createInFrameInfo_seq(int inlinks_id,
-                             const Argument& input,
-                             PassType passType);
-  void createInFrameInfo_subseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-
-  void createOutFrameInfo(OutFrameLine& outFrameLine,
-                          Info& info,
-                          ICpuGpuVectorPtr& sequenceStartPositions,
-                          ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
-                              Info& info,
-                              ICpuGpuVectorPtr& sequenceStartPositions,
-                              ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
-                                 Info& info,
-                                 ICpuGpuVectorPtr& sequenceStartPositions,
-                                 ICpuGpuVectorPtr& subSequenceStartPositions);
-
-  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
-                             PassType passType);
-
-  void copyScattedId(std::vector<int>& srcIds, IVectorPtr* dstIds, int size);
-
-  void selectRowsOneTime(LayerPtr layer,
-                         const IVectorPtr& allIds,
-                         Argument* arg,
-                         PassType passType);
-
-  void createSeqPos(const std::vector<int>& sequenceStartPosition,
-                    ICpuGpuVectorPtr* sequenceStartPositions);
-
-  // for generator
-  struct EosFrameLine {
-    std::vector<LayerPtr> layers;
-  };
-  std::unique_ptr<EosFrameLine> eosFrameLine_;
-
-  struct Generator {
-    GeneratorConfig config;
-    std::vector<int> ids;       // store generated sequences
-    std::vector<real> idsProb;  // log probability of each generated word
-    Argument outArg;            // final output argument
-  };
-  bool generating_;
-  Generator generator_;
-
-  std::vector<std::unique_ptr<NeuralNetwork>> frames_;
-
-  NeuralNetwork* rootNetwork_;
-  bool reversed_;
-
-  int maxSequenceLength_;  // Max top-level length
-  bool useGpu_;
-  bool stopBeamSearch_;
-
-  std::vector<int>
-      parameterIds_;  // parameters actually used by this Layer Group
-
-  // store final argument of outFrameLines_
-  std::vector<Argument> dataArgs_;
-  // store each frame's output argument of outFrameLines_
-  std::vector<std::vector<Argument>> dataArgsFrame_;
-  size_t dataArgsSize_;  // size of dataArgs_ = size of dataArgsFrame_
-
-  IVectorPtr cpuId_;
-  MatrixPtr cpuProb_;
-  IVectorPtr cpuEos_;
-
- private:
-  /*
-   * @return beam size in beam search
-   */
-  size_t getBeamSize() { return generator_.config.beam_size(); }
-
-  /*
-   * @return number of sequence in a batch in generation
-   */
-  size_t getGenBatchSize();
-
-  /*
-   * @brief store output of the machineCur-th frame during generation, for
-   * creating the final outlink after the entire generation process is finished.
-   *
-   * In generation, if the layer group has more than 1 outlink, the first
-   * one is reserved to store the generated word indices, the others are data
-   * outlinks, that can be used like a common layer in the network.
-   *
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void copyDataOutlinkFrame(size_t machineCur);
-
-  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlink
-   * except the first one is a data outlink. In RecurrentLayerGroup, each time
-   * step is a separate Network, outputs of a layer inside the
-   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
-   * specified as an outlink of RecurrentLayerGroup. This function will
-   * collect outputs in each time step of each generated sequence which are
-   * dispersed in separate Arguments to form a new single Argument as output of
-   * RecurrentLayerGroup.
-   */
-  void createDataOutlink();
-
-  /*
-   * @brief decide to select how many rows from the Matrix stored the forward
-   * pass results from a start position.
-   *
-   * @param isSeq: a flag indicating whetehr the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the the returned Arguments of the forward pass
-   * during the generation process.
-   * @param copySize: the returned result, number of rows to select from the
-   * Matrix stored the forward pass results from a start position.
-   */
-  void createDataOutlinkCopySizeInfo(bool isSeq,
-                                     std::vector<Argument>& outArgs,
-                                     std::vector<int>& copySize);
-
-  /*
-   * @brief decide index of the start row for each time step of a generated
-   * sequence in Matrix stored the entire beam search batch's forward pass
-   * results.
-   *
-   * @param isSeq: a flag indicating whether the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the returned Arguments of the forward pass
-   * during the generation process.
-   */
-  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
-
-  /*
-   * @brief used in beam search, connect previous frame to form recurrent link
-   * @param stepId : iteration number of generation process.
-   * It equals to the length of longest half-generated sequence.
-   * @param paths : half-generated paths that are going to be expanded
-   * in current beam search iteration.
-   */
-  void connectPrevFrame(int stepId, std::vector<Path>& paths);
-
-  /*
-   * @brief used in beam search, forward current recurrent frame
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void forwardFrame(int machineCur);
-
-  /*
-   * @brief reduce all expanded paths to beam size.
-   *
-   * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths
-   * for the seqId-th sequence
-   * @param seqId : sequence index in a batch
-   * @param totalExpandCount : number of already shrinked paths in newPaths
-   * @return size of retained paths at the end of a beam search iteration
-   */
-  size_t beamShrink(std::vector<Path>& newPaths,
-                    size_t seqId,
-                    size_t totalExpandCount);
-
-  /*
-   * @brief expand a single path to expandWidth new paths
-   * with highest probability
-   * @param curPath : path to be expanded
-   * @param curPathId : index of curPath in member newPaths
-   * @param expandWidth : number of paths to be expanded
-   */
-  void singlePathExpand(Path& curPath,
-                        size_t curPathId,
-                        std::vector<Path>& newPaths,
-                        size_t expandWidth);
-
-  /*
-   * @brief A new beam search iteration. Each half-generated paths in previous
-   * beam search iteration are further expanded to beam_size new paths
-   * with highest probabilities, and then all the expanded paths are again
-   * reduced to beam_size paths according to their log probabilities.
-   * @param paths : half-generated paths in previous iteration.
-   * @param newPaths : paths expanded and then reduces in current iteration.
-   */
-  void beamExpand(std::vector<Path>& paths, std::vector<Path>& newPaths);
-
-  /*
-   * @brief fill sequence start positions and some other information that are
-   * uesed by the "text_printer" evaluator.
-   */
-  void fillGenOutputs();
-
-  std::vector<int> machineIds_;
-  std::vector<int> topIds_;
-  std::vector<int> seqIds_;
-  std::vector<int> batchMachineIdVec_;
-  std::vector<int> batchMachineStartPos_;
-  std::vector<std::vector<Path>> finalPaths_;
-  std::vector<real> minFinalPathLogProb_;
-  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
-  BeamSearchStatisticsCallbacks* beamSearchStatistics_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.cpp b/paddle/legacy/gserver/layers/AddtoLayer.cpp
deleted file mode 100644
index 39c5603d9..000000000
--- a/paddle/legacy/gserver/layers/AddtoLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AddtoLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(addto, AddtoLayer);
-
-bool AddtoLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void AddtoLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  reserveOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    MatrixPtr input = getInputValue(i);
-    i == 0 ? outV->assign(*input) : outV->add(*input);
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AddtoLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      preGrad->add(*getOutputGrad());
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.h b/paddle/legacy/gserver/layers/AddtoLayer.h
deleted file mode 100644
index ad3cefe1a..000000000
--- a/paddle/legacy/gserver/layers/AddtoLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * This layer just simply add all input layers together, then activate
- * the sum inputs. Each input of this layer should be the same size,
- * which is also the output size of this layer.
- * \f[
- *   y=f(\sum_{i}x_i + b)
- * \f]
- * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
- * activation function.
- *
- * The config file api is addto_layer.
- */
-class AddtoLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AddtoLayer() {}
-
-  /**
-   * Intialization of AddtoLayer.
-   */
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Forward propagation.
-   * @note There is no weight matrix for each input,
-   *       because it just a simple add operation.
-   */
-  void forward(PassType passType) override;
-
-  /**
-   * Backward propagation.
-   */
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.cpp b/paddle/legacy/gserver/layers/AgentLayer.cpp
deleted file mode 100644
index bae89b2fa..000000000
--- a/paddle/legacy/gserver/layers/AgentLayer.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AgentLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(agent, AgentLayer);
-
-bool AgentLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void AgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    if (realOutput.hasSeq()) {
-      int numRows =
-          realOutput.sequenceStartPositions->getData(false)[numSamples_];
-      output_.subArgFrom(realOutput,
-                         /* offset */ 0,
-                         numRows,
-                         getSize(),
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ 0,
-                         /* seqSize */ numSamples_ + 1);
-    } else {
-      output_.subArgFrom(
-          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
-    }
-  } else {
-    output_ = realOutput;
-  }
-}
-
-bool GatherAgentLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void GatherAgentLayer::copyIdAndSequenceInfo(
-    ICpuGpuVectorPtr sequenceStartPositions,
-    ICpuGpuVectorPtr subSequenceStartPositions,
-    const IVectorPtr& ids,
-    const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = sequenceStartPositions;
-  output_.subSequenceStartPositions = subSequenceStartPositions;
-  allIds_ = ids;
-  idIndex_ = idIndex;
-}
-
-void GatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  forwardIds(passType);
-  forwardValue(passType);
-}
-
-void GatherAgentLayer::forwardValue(PassType passType) {
-  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
-  if (!valueReal) return;
-
-  int height = allIds_->getSize();
-  int width = this->getSize();
-  resetOutput(height, width);
-  idsVec_.resize(idIndex_.size());
-
-  const MatrixPtr& outV = getOutputValue();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realV = realLayers_[i]->getOutputValue();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realV->getHeight(),
-                                 useGpu_);
-    realV->addToRows(*outV, *idsVec_[i]);
-  }
-}
-
-namespace {
-
-// dest[index[i]] <- src[i] for each i
-void copyElements(const IVector& srcVec,
-                  const IVector& indexVec,
-                  IVector& destVec) {
-  const int* src = srcVec.getData();
-  const int* index = indexVec.getData();
-  int* dest = destVec.getData();
-  int len = indexVec.getSize();
-  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
-  for (int i = 0; i < len; ++i) {
-    dest[index[i]] = src[i];
-  }
-}
-}  // namespace
-
-void GatherAgentLayer::forwardIds(PassType passType) {
-  IVectorPtr realId = realLayers_[0]->getOutputLabel();
-  if (!realId) return;
-
-  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
-  IVectorPtr outId = output_.ids;
-  idsVec_.resize(idIndex_.size());
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realId->getSize(),
-                                 useGpu_);
-    execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
-  }
-}
-
-void GatherAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  const MatrixPtr& outputGrad = getOutputGrad();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realG = realLayers_[i]->getOutputGrad();
-    if (realG) {
-      realG->selectRows(*outputGrad, *idsVec_[i]);
-    }
-  }
-}
-
-bool ScatterAgentLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void ScatterAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  int width = this->getSize();
-  if (selectionMode_) {
-    forwardWithSelection(passType);
-  } else {
-    if (realOutArg_.hasSeq()) {
-      output_.subArgFrom(realOutArg_,
-                         /* offset */ idIndex_,
-                         idSize_,
-                         width,
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ seqStartPosIndex_,
-                         /* seqSize */ numSequences_);
-    } else {
-      output_.subArgFrom(
-          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
-    }
-  }
-}
-
-void ScatterAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  CHECK(!selectionMode_);
-
-  const MatrixPtr& outputGrad = realOutArg_.grad;
-  const MatrixPtr& realGrad = realLayer_->getOutputGrad();
-  if (realGrad) {
-    // for agent in inFrameLines and memoryFrameLines,
-    // only first scatterAgentLayer should do addToRows in backward
-    if (handleBackward_) {
-      outputGrad->addToRows(*realGrad, *ids_);
-    }
-  }
-}
-
-REGISTER_LAYER(gather_agent, GatherAgentLayer);
-REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
-
-void ScatterAgentLayer::forwardWithSelection(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  const Argument& input = realLayer_->getOutput();
-  CHECK_EQ(realLayer_->getSize(), this->getSize());
-  int width = this->getSize();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
-
-  if (!input.hasSeq()) {
-    if (realLayer_->getOutput().ids) {
-      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-    }
-    if (realLayer_->getOutput().value) {
-      int height = ids_->getSize();
-      resetOutput(height, width);
-
-      const MatrixPtr& outV = getOutputValue();
-      const MatrixPtr& realV = realLayer_->getOutputValue();
-      outV->selectRows(*realV, *ids_);
-    }
-  } else {
-    // Putting the generation logic here is really an ugly hack!
-    // used in generation
-    int height = 0;
-    size_t numSequences = ids_->getSize();
-    const int* starts = input.getCpuStartPositions();
-    size_t size = input.hasSubseq() ? input.getNumSubSequences()
-                                    : input.getNumSequences();
-    const int* cpuIds = cpuIds_->getData();
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      size_t seqId = cpuIds[i];
-      CHECK_LT(seqId, size);
-      height += starts[seqId + 1] - starts[seqId];
-    }
-    reserveOutput(height, width);
-
-    const MatrixPtr& outputValue = getOutputValue();
-
-    CHECK_NE(input.sequenceStartPositions.get(),
-             output_.sequenceStartPositions.get());
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-
-    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = inputStartPos_->getMutableData(false);
-
-    size_t offsetOut = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      outStarts[i] = offsetOut;
-      size_t seqId = cpuIds[i];
-      int size = starts[seqId + 1] - starts[seqId];
-      for (int j = 0; j < size; j++) {
-        inStarts[offsetOut + j] = starts[seqId] + j;
-      }
-      offsetOut += size;
-    }
-    outStarts[numSequences] = offsetOut;
-
-    outputValue->copyByRowIndex(*input.value,
-                                *inputStartPos_->getVector(useGpu_));
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.h b/paddle/legacy/gserver/layers/AgentLayer.h
deleted file mode 100644
index a05eac5e7..000000000
--- a/paddle/legacy/gserver/layers/AgentLayer.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * AgentLayer use as a virtual input of another layer in config,
- * before execute forward/backward, setRealLayer() should be
- * called to set one and only one real layer
- */
-class AgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  int numSamples_;
-
- public:
-  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // if *numSamples* set,
-  // real layer output will only use first *numSamples* rows
-  void setRealLayer(LayerPtr layer, int numSamples = 0) {
-    realLayer_ = layer;
-    numSamples_ = numSamples;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-/**
- * Like AgentLayer, but it can gather many real layers. Each real
- * layer give a few rows of a sequence, after gather all real layers,
- * GatherAgentLayer collect a complete sequence.
- */
-class GatherAgentLayer : public Layer {
- protected:
-  std::vector<LayerPtr> realLayers_;
-  std::vector<IVectorPtr> idsVec_;
-  // we don't clear idsVec_ vector to aviod IVector alloc/free
-  IVectorPtr allIds_;
-  std::vector<int> idIndex_;
-
- public:
-  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~GatherAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // call before addRealLayer
-  void clearRealLayers() { realLayers_.clear(); }
-
-  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
-                             ICpuGpuVectorPtr subSequenceStartPositions,
-                             const IVectorPtr& allIds,
-                             const std::vector<int>& idIndex);
-
-  // add one real layer, can call many times
-  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void forwardValue(PassType passType);
-  void forwardIds(PassType passType);
-};
-
-/**
- * Like AgentLayer, but only select a few rows in real layer.
- * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
- * are the selected row ids. It's used to scatter one layer's output
- * to many small submodels. ScatterAgentLayer can support ids real layer,
- * if it is, the agent will select a few ids in real layer.
- */
-class ScatterAgentLayer : public Layer {
- protected:
-  LayerPtr realLayer_;
-  IVectorPtr ids_;
-  IVectorPtr cpuIds_;
-  Argument realOutArg_;
-  int idIndex_;
-  int idSize_;
-  int seqStartPosIndex_;
-  int numSequences_;  // number of sequences in this scatterAgentLayer
-  bool handleBackward_;
-
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-  // true for setRealLayer, false for setRealLayerAndOutput
-  bool selectionMode_;
-
- public:
-  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~ScatterAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief set real layer in generation
-   *
-   * @param layer[input]    realLayer
-   * @param ids[input]      row id in real layer
-   * @param copyId[input]   whether to copy a cpu version of ids,
-   *                        false(default) in ScatterAgentLayer, and
-   *                        true in SequenceScatterAgentLayer.
-   */
-  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
-    realLayer_ = layer;
-    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
-    ids_->copyFrom(ids.data(), ids.size());
-    if (useGpu_) {
-      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-      cpuIds_->copyFrom(ids.data(), ids.size());
-    } else {
-      cpuIds_ = ids_;
-    }
-    selectionMode_ = true;
-  }
-
-  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
-  // are selected row for realOutArg in realLayer
-  void setRealLayerAndOutput(LayerPtr layer,
-                             const Argument& outArg,
-                             const IVectorPtr& ids,
-                             int idIndex,
-                             int idSize,
-                             bool handleBackward) {
-    realLayer_ = layer;
-    realOutArg_ = outArg;
-    ids_ = ids;
-    idIndex_ = idIndex;
-    idSize_ = idSize;
-    handleBackward_ = handleBackward;
-    selectionMode_ = false;
-  }
-
-  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
-                                 int seqStartPosIndex,
-                                 int numSequences) {
-    realOutArg_.sequenceStartPositions = sequenceStartPositions;
-    seqStartPosIndex_ = seqStartPosIndex;
-    numSequences_ = numSequences;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  void forwardWithSelection(PassType passType);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.cpp b/paddle/legacy/gserver/layers/AverageLayer.cpp
deleted file mode 100644
index 0539da793..000000000
--- a/paddle/legacy/gserver/layers/AverageLayer.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(average, AverageLayer);
-
-bool AverageLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-
-  // average strategy
-  if (config_.average_strategy() == "average") {
-    mode_ = kAverage;
-  } else if (config_.average_strategy() == "sum") {
-    mode_ = kSum;
-  } else if (config_.average_strategy() == "squarerootn") {
-    mode_ = kAverageSquareRootN;
-  } else {
-    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
-  }
-  return true;
-}
-
-void AverageLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(
-      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-
-  /* add the bias-vector AFTER average operation */
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AverageLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  if (getInputGrad(0)) {
-    getInputGrad(0)->sequenceAvgBackward(
-        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.h b/paddle/legacy/gserver/layers/AverageLayer.h
deleted file mode 100644
index a0d457d35..000000000
--- a/paddle/legacy/gserver/layers/AverageLayer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal average" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = average_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the average pooling
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-class AverageLayer : public SequencePoolLayer {
- public:
-  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  explicit AverageLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  int mode_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
deleted file mode 100644
index 4dcbd8dc2..000000000
--- a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BatchNormBaseLayer.h"
-#include "BatchNormalizationLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnBatchNormLayer.h"
-#endif
-
-namespace paddle {
-
-bool BatchNormBaseLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  /* initialize the weightList */
-  // first is Input in configure
-  // other two is created in config_parser.py
-  CHECK_EQ(inputLayers_.size(), 3U);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  channels_ = conf.channels();
-  calFeatureMapSize();
-
-  if (config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-  movingAvgFraction_ = config_.moving_average_fraction();
-  epsilon_ = config_.epsilon();
-
-  weight_.reset(new Weight(1, channels_, parameters_[0]));
-  movingMean_.reset(new Weight(1, channels_, parameters_[1]));
-  movingVar_.reset(new Weight(1, channels_, parameters_[2]));
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, channels_, biasParameter_));
-  }
-
-  savedMean_ = Matrix::create(1, channels_, false, useGpu_);
-  savedInvVar_ = Matrix::create(1, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedInvVar_->zeroMem();
-
-  return true;
-}
-
-void BatchNormBaseLayer::calFeatureMapSize() {
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
-
-  if (0 == imageD_) imageD_ = conf.img_size_z();
-  if (imageH_ == 0 && imageW_ == 0) {
-    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-    imageW_ = conf.img_size();
-  } else {
-    getOutput().setFrameHeight(imageH_);
-    getOutput().setFrameWidth(imageW_);
-    getOutput().setFrameDepth(imageD_);
-  }
-  imgPixels_ = imageH_ * imageW_ * imageD_;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
deleted file mode 100644
index 8dc1d7883..000000000
--- a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Batch normalization layer use to normalizes the input to across the
- * batch.
- *
- * By default, calculating global mean and variance statistics via a running
- * average in the training peroid. Then the pre-calculated global mean and
- * variance are used for testing.
- *
- * Moving mean and variance are located in Parameter object when constructing
- * and the calculation will change them. Now we only save global mean and
- * variance of one thread in first node for GPU.
- * But the calculation in CPU is different, because parameters are shared by
- * multiple threads. Here using ShareCpuMatrix with lock to calculate. We
- * still save global mean and variance in first node in CPU when multi machine.
- *
- * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
- *     Training by Reducing Internal Covariate Shift." arXiv preprint
- *     arXiv:1502.03167 (2015).
- */
-
-class BatchNormBaseLayer : public Layer {
- public:
-  explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BatchNormBaseLayer() {}
-
-  /**
-   * @brief Create BatchNorm layer by norm_type, including batch_norm and
-   * cudnn_batch_norm. If do not set norm_type, it will automatically select
-   * cudnn_batch_norm for GPU and batch_norm for CPU.
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief Calculate feature map size. Some input uses frameHeight and
-   * frameWidth to store feature size
-   */
-  void calFeatureMapSize();
-
- protected:
-  /// Batch normalization scale parameter, which is referred to as gamma in
-  /// in original paper.
-  std::unique_ptr<Weight> weight_;
-  /// Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  /// Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-  /// Batch normalization bias parameter, which is referred to as beta in
-  /// in original paper.
-  std::unique_ptr<Weight> biases_;
-
-  /// Save intermediate results computed during the forward pass,
-  /// these can then be reused to speed up the backward pass.
-  MatrixPtr savedMean_;
-  MatrixPtr savedInvVar_;
-
-  /// Height or width of input image feature.
-  /// Both of them are 1 if the input is fully-connected layer.
-  int imageD_;
-  int imageH_;
-  int imageW_;
-  /// Height * Width.
-  int imgPixels_;
-  /// Feature dimension. If the input layer is conv layer, it is the channels
-  /// of feature map of the conv layer. If the input layer is fully-connected
-  /// layer, it is the dimension of fc layer.
-  int channels_;
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in this mini-batch.
-  bool useGlobalStats_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // Epsilon is a small random noise used in batch normalization for stability.
-  real epsilon_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
deleted file mode 100644
index 0297bd44c..000000000
--- a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "hl_batch_transpose.h"
-#endif
-#include "BatchNormalizationLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
-
-bool BatchNormalizationLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-
-  return true;
-}
-
-void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
-  int numSamples = mat->getHeight();
-  Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedMean_->accumulateColSum(*mat);
-  savedMean_->mulScalar(1.0 / numSamples);  // E[x]
-
-  tmpMat_->assign(*mat);
-  tmpMat_->square2();
-  savedInvVar_->zeroMem();
-  savedInvVar_->accumulateColSum(*tmpMat_);
-  savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
-  savedInvVar_->addSquare(*savedMean_, -1.0);  // E[x^2] - E^2[x]
-
-  // Variance may be small negative value
-  // because of the subtraction operation.
-  // Here using clipping.
-  savedInvVar_->downClip(real(0.0));
-
-  calMovingMeanAndVar();
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::calMovingMeanAndVar() {
-  // calculating and saving moving mean and variance
-  auto& movingMean = movingMean_->getW();
-  auto& movingVar = movingVar_->getW();
-  // movingMean =  movingMean * movingAvgFraction_
-  //            + savedMean_ * (1 - movingAvgFraction_)
-  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  // movingVar =  movingVar * movingAvgFraction_
-  //           + savedInvVar_ * (1 - movingAvgFraction_)
-  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-}
-
-void BatchNormalizationLayer::setMeanAndStd() {
-  savedMean_->copyFrom(*(movingMean_->getW()));
-  savedInvVar_->copyFrom(*(movingVar_->getW()));
-  savedInvVar_->downClip(real(0.0));
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_));
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  size_t batchSize = in->getHeight();
-  CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), imgPixels_, channels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  size_t batchSize = out->getHeight();
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), channels_, imgPixels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * channels_ * imgPixels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  Matrix::resizeOrCreate(
-      expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_);
-  expandMat(getInputValue(0), expandedIn_);
-
-  if (useGlobalStats_) {
-    if (firstTest_) {
-      setMeanAndStd();
-      firstTest_ = false;
-    }
-  } else {
-    calMeanAndStd(expandedIn_);
-    firstTest_ = true;
-  }
-
-  normIn_->assign(*expandedIn_);
-  normIn_->addBias(*savedMean_, -1);     // subtract mean.
-  normIn_->divRowVector(*savedInvVar_);  // divide std.
-
-  expandedOut_->assign(*normIn_);
-  expandedOut_->mulRowVector(*weight_->getW());  // multiple gamma.
-  if (biases_) {
-    expandedOut_->addBias(*(biases_->getW()), 1);  // add beta.
-  }
-  MatrixPtr out = getOutputValue();
-  shrinkMat(expandedOut_, out);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void BatchNormalizationLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  int batchSize = getInputValue(0)->getHeight();
-
-  Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_);
-
-  Matrix::resizeOrCreate(
-      expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-
-  expandMat(getOutputGrad(), expandedOutGrad_);
-
-  // compute derivatives.
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*expandedOutGrad_, 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  if (weight_->getWGrad()) {
-    tmpMat_->dotMul(*expandedOutGrad_, *normIn_);
-    weight_->getWGrad()->collectBias(*tmpMat_, 1);
-  }
-
-  // compute input gradients.
-  normInGrad_->assign(*expandedOutGrad_);
-  normInGrad_->mulRowVector(*(weight_->getW()));  // multiple gamma.
-  // normInGrad * (x - \mu)/ \sqrt(\delta^2)
-  tmpMat_->dotMul(*normInGrad_, *normIn_);
-  stdGrad_->zeroMem();
-  stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_));
-  tmpGrad_->assign(*normIn_);
-  tmpGrad_->mulRowVector(*stdGrad_);
-
-  meanGrad_->zeroMem();
-  meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_));
-
-  expandedInGrad_->zeroMem();
-  expandedInGrad_->add(*normInGrad_, *tmpGrad_);
-  expandedInGrad_->addRowVector(*meanGrad_);
-  expandedInGrad_->divRowVector(*savedInvVar_);
-
-  shrinkMat(expandedInGrad_, inGrad_);
-  if (getInputGrad(0)) {
-    getInputGrad(0)->add(*getInputGrad(0), *inGrad_);
-  }
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.h b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
deleted file mode 100644
index e5e4e690b..000000000
--- a/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A Inheritance class of Batch normalization layer.
- * It supports both CPU and GPU.
- *
- * The config file api is batch_norm_layer.
- */
-
-class BatchNormalizationLayer : public BatchNormBaseLayer {
- public:
-  explicit BatchNormalizationLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config), firstTest_(true) {}
-
-  ~BatchNormalizationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  /// Load pre-calculated mean and std.
-  void setMeanAndStd();
-
-  /// Calculate mean and std.
-  void calMeanAndStd(const MatrixPtr& mat);
-
-  /// Calculate moving mean and variance.
-  void calMovingMeanAndVar();
-
-  /// expand a Matrix from batch, channels* imagePixels to
-  /// batch * ImagePixels * channels.
-  void expandMat(const MatrixPtr& in, MatrixPtr& out);
-
-  /// Shrink a Matrix from  from batch * ImagePixels * channels
-  /// to batch, channels* imagePixels.
-  void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
-
-  void onPassEnd() override { firstTest_ = true; }
-
-  MatrixPtr tmpMat_, tmpGrad_;
-  MatrixPtr expandedIn_, expandedOut_;
-  MatrixPtr expandedInGrad_, expandedOutGrad_, inGrad_;
-  MatrixPtr normIn_, normInGrad_, meanGrad_, stdGrad_;
-
-  /// Load mean and variance only once flag.
-  bool firstTest_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
deleted file mode 100644
index a091f51bc..000000000
--- a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BilinearInterpLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(bilinear_interp, BilinearInterpLayer);
-
-size_t BilinearInterpLayer::getSize() {
-  inImgH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  inImgW_ = inputLayers_[0]->getOutput().getFrameWidth();
-
-  const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
-  if (inImgH_ == 0) {
-    inImgH_ = conf.image_conf().img_size_y();
-  }
-  if (inImgW_ == 0) {
-    inImgW_ = conf.image_conf().img_size();
-  }
-
-  outImgH_ = conf.out_size_y();
-  outImgW_ = conf.out_size_x();
-  numChannels_ = conf.image_conf().channels();
-
-  CHECK(outImgH_ > 0 && outImgW_ > 0);
-  CHECK(inImgH_ > 0 && inImgW_ > 0);
-  CHECK(numChannels_);
-
-  ratioH_ =
-      (outImgH_ > 1) ? static_cast<real>(inImgH_ - 1) / (outImgH_ - 1) : 0.f;
-  ratioW_ =
-      (outImgW_ > 1) ? static_cast<real>(inImgW_ - 1) / (outImgW_ - 1) : 0.f;
-
-  getOutput().setFrameHeight(outImgH_);
-  getOutput().setFrameWidth(outImgW_);
-  return outImgH_ * outImgW_ * numChannels_;
-}
-
-bool BilinearInterpLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1, config_.inputs_size());
-
-  return true;
-}
-
-void BilinearInterpLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str());
-    outV->bilinearForward(*inV,
-                          inImgH_,
-                          inImgW_,
-                          outImgH_,
-                          outImgW_,
-                          numChannels_,
-                          ratioH_,
-                          ratioW_);
-  }
-}
-
-void BilinearInterpLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr inputG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-  {
-    REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str());
-    if (inputG) {
-      inputG->bilinearBackward(*outG,
-                               outImgH_,
-                               outImgW_,
-                               inImgH_,
-                               inImgW_,
-                               numChannels_,
-                               ratioH_,
-                               ratioW_);
-    }
-  }
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.h b/paddle/legacy/gserver/layers/BilinearInterpLayer.h
deleted file mode 100644
index c585a5ed1..000000000
--- a/paddle/legacy/gserver/layers/BilinearInterpLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for bilinear interpolation which is
- *        used on conv layer output.
- *
- * @note  The config file api is bilinear_interp_layer.
- */
-class BilinearInterpLayer : public Layer {
- protected:
-  size_t outImgH_, outImgW_;
-  size_t inImgH_, inImgW_;
-  real ratioH_, ratioW_;
-  size_t numChannels_;
-
- public:
-  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~BilinearInterpLayer() {}
-
-  size_t getSize();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
deleted file mode 100644
index 24b5af67d..000000000
--- a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BlockExpandLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(blockexpand, BlockExpandLayer);
-
-bool BlockExpandLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(config_.inputs_size(), 1);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  blockH_ = blockConf.block_y();
-  blockW_ = blockConf.block_x();
-  strideH_ = blockConf.stride_y();
-  strideW_ = blockConf.stride_x();
-  paddingH_ = blockConf.padding_y();
-  paddingW_ = blockConf.padding_x();
-  channels_ = blockConf.channels();
-  imgSizeH_ = blockConf.img_size_y();
-  imgSizeW_ = blockConf.img_size_x();
-
-  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
-  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
-  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
-  createFunction(forward_,
-                 "BlockExpand",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-  createFunction(backward_,
-                 "BlockExpandGrad",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-
-  return true;
-}
-
-size_t BlockExpandLayer::getBlockNum() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = blockConf.img_size_y();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = blockConf.img_size_x();
-  }
-  size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_;
-  outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_;
-  size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_;
-  outputW_ = (int)tmpW < 0 ? 1 : 1 + (tmpW + strideW_ - 1) / strideW_;
-
-  return outputH_ * outputW_;
-}
-
-void BlockExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  size_t blockNum = getBlockNum();
-  size_t blockSize = blockH_ * blockW_ * channels_;
-  resetOutput(blockNum * batchSize, blockSize);
-
-  // calculate output_.value
-  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inputShape_);
-  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
-  Argument& out = getOutput();
-  ICpuGpuVector::resizeOrCreate(
-      out.sequenceStartPositions, batchSize + 1, false);
-  IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
-  int* start = out.sequenceStartPositions->getMutableData(false);
-  int* dims = out.cpuSequenceDims->getData();
-  for (size_t i = 0; i < batchSize; i++) {
-    start[i] = i * blockNum;
-    dims[2 * i] = outputH_;
-    dims[2 * i + 1] = outputW_;
-  }
-  start[batchSize] = batchSize * blockNum;
-}
-
-void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  /* Calculate the input layers error */
-  if (getInputGrad(0)) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getOutputGrad(), outputShape_);
-    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.h b/paddle/legacy/gserver/layers/BlockExpandLayer.h
deleted file mode 100644
index 8b90249bf..000000000
--- a/paddle/legacy/gserver/layers/BlockExpandLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Expand feature map to minibatch matrix.
- * - matrix width is: blockH_ * blockW_ * channels_
- * - matirx height is: outputH_ * outputW_
- *
- * \f[
- * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
- *             strideH\_ \\
- * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
- *             strideW\_
- * \f]
- *
- * The expand method is the same with ExpandConvLayer, but saved the transposed
- * value. After expanding, output_.sequenceStartPositions will store timeline.
- * The number of time steps are outputH_ * outputW_ and the dimension of each
- * time step is blockH_ * blockW_ * channels_. This layer can be used after
- * convolution neural network, and before recurrent neural network.
- *
- * The config file api is block_expand_layer.
- */
-class BlockExpandLayer : public Layer {
- protected:
-  /**
-   * @brief Calculate outputH_ and outputW_ and return block number which
-   * actually is time steps.
-   * @return time steps, outoutH_ * outputW_.
-   */
-  size_t getBlockNum();
-  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
-  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
-
-  TensorShape inputShape_;
-  TensorShape outputShape_;
-
- public:
-  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BlockExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp b/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
deleted file mode 100644
index 4afed7e29..000000000
--- a/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CRFDecodingLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(crf_decoding, CRFDecodingLayer);
-
-bool CRFDecodingLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  if (!CRFLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
-  return true;
-}
-
-void CRFDecodingLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(!useGpu_) << "GPU is not supported";
-
-  const Argument& output = getInput(0);
-  CHECK(output.sequenceStartPositions);
-
-  size_t batchSize = output.getBatchSize();
-  size_t numSequences = output.sequenceStartPositions->getSize() - 1;
-
-  IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
-  const int* starts = output.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], (int)batchSize);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    crf_->decode(output.value->getData() + numClasses_ * starts[i],
-                 output_.ids->getData() + starts[i],
-                 starts[i + 1] - starts[i]);
-  }
-
-  if (inputLayers_.size() == 2) {
-    const Argument& label = getInput(1);
-    resizeOutput(batchSize, 1);
-    CHECK(label.ids);
-    real* error = output_.value->getData();
-    int* ids = label.ids->getData();
-    int* result = output_.ids->getData();
-    for (size_t i = 0; i < batchSize; ++i) {
-      error[i] = ids[i] == result[i] ? 0 : 1;
-    }
-  }
-}
-
-void CRFDecodingLayer::backward(const UpdateCallback& callback) {
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFDecodingLayer.h b/paddle/legacy/gserver/layers/CRFDecodingLayer.h
deleted file mode 100644
index 018162e14..000000000
--- a/paddle/legacy/gserver/layers/CRFDecodingLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "CRFLayer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the decoding sequence of sequential conditional
- * random field model.
- * The decoding sequence is stored in output_.ids
- * It also calculate error, output_.value[i] is 1 for incorrect decoding
- * or 0 for correct decoding)
- * See LinearChainCRF.h for the detail of the CRF formulation.
- */
-class CRFDecodingLayer : public CRFLayer {
- public:
-  explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  std::unique_ptr<LinearChainCRF> crf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFLayer.cpp b/paddle/legacy/gserver/layers/CRFLayer.cpp
deleted file mode 100644
index 8b87a533a..000000000
--- a/paddle/legacy/gserver/layers/CRFLayer.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CRFLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(crf, CRFLayer);
-
-bool CRFLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  if (config_.type() == "crf") {
-    CHECK_GE(inputLayers_.size(), 2UL);
-    // the third output is sequence weight. one weight for each sequence
-    CHECK_LE(inputLayers_.size(), 3UL);
-  }
-
-  // coeff only affect bp, keep consistent with CostLayer
-  coeff_ = config_.coeff();
-  if (inputLayers_.size() == 3) {
-    weightLayer_ = inputLayers_[2];
-  }
-
-  numClasses_ = inputLayers_[0]->getSize();
-
-  CHECK_GE(numClasses_, 2UL);
-
-  CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
-
-  parameter_ = parameters_[0];
-  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-
-  return true;
-}
-
-void CRFLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(!useGpu_) << "GPU is not supported";
-
-  const Argument& output = getInput(0);
-  const Argument& label = getInput(1);
-  CHECK(label.sequenceStartPositions);
-  CHECK(label.ids);
-
-  int batchSize = output.getBatchSize();
-  size_t numSequences = label.sequenceStartPositions->getSize() - 1;
-  resizeOutput(numSequences, 1);
-
-  const int* starts = label.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
-    }
-    output_.value->getData()[i] =
-        crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
-                         label.ids->getData() + starts[i],
-                         starts[i + 1] - starts[i]);
-  }
-
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    getOutputValue()->dotMul(*getOutputValue(), *weight);
-  }
-}
-
-void CRFLayer::backward(const UpdateCallback& callback) {
-  const Argument& output = getInput(0);
-  const Argument& label = getInput(1);
-  const int* starts = label.sequenceStartPositions->getData(false);
-  int numSequences = label.sequenceStartPositions->getSize() - 1;
-
-  bool needWGrad = weight_->getWGrad() ? true : false;
-  for (int i = 0; i < numSequences; ++i) {
-    crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i],
-                      needWGrad);
-    real instanceWeight = weightLayer_
-                              ? getInputValue(*weightLayer_)->getElement(i, 0)
-                              : real(1.0f);
-    instanceWeight *= coeff_;
-
-    if (output.grad) {
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
-    }
-    if (needWGrad) {
-      weight_->getWGrad()->add(
-          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
-    }
-  }
-
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CRFLayer.h b/paddle/legacy/gserver/layers/CRFLayer.h
deleted file mode 100644
index 88c2ed343..000000000
--- a/paddle/legacy/gserver/layers/CRFLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the cost of sequential conditional random field
- * model.
- * See class LinearChainCRF for the detail of the CRF formulation.
- */
-class CRFLayer : public Layer {
- public:
-  explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  size_t numClasses_;
-  ParameterPtr parameter_;
-  std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;            // weight for each sequence
-  std::unique_ptr<Weight> weight_;  // parameters
-  real coeff_;                      // weight for the layer
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CTCLayer.cpp b/paddle/legacy/gserver/layers/CTCLayer.cpp
deleted file mode 100644
index 64eb15cd0..000000000
--- a/paddle/legacy/gserver/layers/CTCLayer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CTCLayer.h"
-
-/* Please reference the Chapter7  in
- * "Alex graves, Supervised Sequence Labelling with
- * Recurrent Neural Networks" */
-namespace paddle {
-REGISTER_LAYER(ctc, CTCLayer);
-
-bool CTCLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL);
-
-  /* The inputLayers_[0] must be softmax output */
-  numClasses_ = inputLayers_[0]->getSize();
-  normByTimes_ = config_.norm_by_times();
-  CHECK_GE(numClasses_, 2UL);
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
-  return true;
-}
-
-void CTCLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(
-          getInput(i), false, HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    forwardImp(tmpCpuInput_[0], tmpCpuInput_[1]);
-  } else {
-    forwardImp(getInput(0), getInput(1));
-  }
-}
-
-void CTCLayer::forwardImp(const Argument& softmaxSeqs,
-                          const Argument& labelSeqs) {
-  CHECK(softmaxSeqs.sequenceStartPositions);
-  CHECK(labelSeqs.sequenceStartPositions);
-  CHECK(labelSeqs.ids);
-
-  size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1;
-  CHECK_EQ(numSequences, softmaxSeqs.sequenceStartPositions->getSize() - 1);
-
-  resizeOutput(numSequences, 1);
-  std::vector<real> out(numSequences);
-
-  const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false);
-  const int* softmaxSeqsStarts =
-      softmaxSeqs.sequenceStartPositions->getData(false);
-
-  for (size_t i = 0; i < numSequences; i++) {
-    if (i >= ctcs_.size()) {
-      ctcs_.emplace_back(numClasses_, normByTimes_);
-    }
-    out[i] = ctcs_[i].forward(
-        softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i],
-        softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i],
-        labelSeqs.ids->getData() + labelSeqsStarts[i],
-        labelSeqsStarts[i + 1] - labelSeqsStarts[i]);
-  }
-  output_.value->copyFrom(out.data(), numSequences);
-}
-
-void CTCLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (useGpu_) {
-    backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
-    const_cast<Argument&>(getInput(0))
-        .resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT);
-    const_cast<Argument&>(getInput(1))
-        .resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT);
-  } else {
-    backwardImp(callback, getInput(0), getInput(1));
-  }
-}
-
-void CTCLayer::backwardImp(const UpdateCallback& callback,
-                           const Argument& softmaxSeqs,
-                           const Argument& labelSeqs) {
-  size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1;
-
-  const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false);
-  const int* softmaxSeqsStarts =
-      softmaxSeqs.sequenceStartPositions->getData(false);
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    ctcs_[i].backward(
-        softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i],
-        softmaxSeqs.grad->getData() + numClasses_ * softmaxSeqsStarts[i],
-        labelSeqs.ids->getData() + labelSeqsStarts[i],
-        labelSeqsStarts[i + 1] - labelSeqsStarts[i]);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CTCLayer.h b/paddle/legacy/gserver/layers/CTCLayer.h
deleted file mode 100644
index 5d70b1f4c..000000000
--- a/paddle/legacy/gserver/layers/CTCLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LinearChainCTC.h"
-
-namespace paddle {
-
-class CTCLayer : public Layer {
- public:
-  explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  void backward(const UpdateCallback& callback) override;
-  void backwardImp(const UpdateCallback& callback,
-                   const Argument& softmaxSeqs,
-                   const Argument& labelSeqs);
-
- protected:
-  size_t numClasses_;
-  bool normByTimes_;
-  std::vector<LinearChainCTC> ctcs_;
-  std::vector<Argument> tmpCpuInput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ClipLayer.cpp b/paddle/legacy/gserver/layers/ClipLayer.cpp
deleted file mode 100644
index 6aa3c8fe6..000000000
--- a/paddle/legacy/gserver/layers/ClipLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for clipping the input value by the threshold.
- * \f[
- *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
- * \f]
- */
-
-class ClipLayer : public Layer {
- protected:
-  double min_;
-  double max_;
-
- public:
-  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(clip, ClipLayer);
-
-bool ClipLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-  auto layerConf = config_.inputs(0).clip_conf();
-  min_ = layerConf.min();
-  max_ = layerConf.max();
-  CHECK_LT(min_, max_);
-  return true;
-}
-
-void ClipLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(*inV);
-  outV->clip(min_, max_);
-}
-
-void ClipLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  if (inG) {
-    MatrixPtr outV = getOutputValue();
-    MatrixPtr outG = getOutputGrad();
-    MatrixPtr tmpMtx;
-    Matrix::resizeOrCreate(
-        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
-    tmpMtx->clipDerivative(*inV, min_, max_);
-    inG->addDotMul(*outG, *tmpMtx, 1, 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
deleted file mode 100644
index ce3f2ca95..000000000
--- a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A concatenate layer has multiple input layers. It concatenates rows of
- * each input as one row for the output of this layer and apply activation.
- */
-class ConcatenateLayer : public Layer {
- public:
-  explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(concat, ConcatenateLayer);
-
-bool ConcatenateLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK(!biasParameter_);
-
-  return true;
-}
-
-void ConcatenateLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr& out = getOutputValue();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputValue(i);
-    size_t inSize = in->getWidth();
-    out->assignAtOffset(*in, offset);
-    offset += inSize;
-  }
-  CHECK_EQ(size, offset);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const MatrixPtr& out = getOutputGrad();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputGrad(i);
-    size_t inSize = getInputValue(i)->getWidth();
-    if (in) {
-      in->addAtOffset(*out, offset);
-    }
-    offset += inSize;
-  }
-}
-
-/**
- * concat2 layer is like concat layer, but each input layer was
- * processed by a Projection.
- */
-class ConcatenateLayer2 : public Layer {
- public:
-  explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer2() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-  bool sharedBias_;
-  std::unique_ptr<Weight> biases_;
-};
-
-REGISTER_LAYER(concat2, ConcatenateLayer2);
-
-bool ConcatenateLayer2::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projCol_.reserve(inputLayers_.size());
-  projOutput_.resize(inputLayers_.size());
-
-  size_t startCol = 0;
-  size_t endCol = 0;
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    projections_.emplace_back(Projection::create(
-        config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-
-    endCol += projections_[i]->getOutputSize();
-    projCol_.push_back(std::make_pair(startCol, endCol));
-    startCol = endCol;
-  }
-  CHECK_EQ(getSize(), endCol);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void ConcatenateLayer2::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  for (size_t i = 0; i < projections_.size(); i++) {
-    size_t startCol = projCol_[i].first;
-    size_t endCol = projCol_[i].second;
-    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
-    if (output_.grad) {
-      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
-    }
-  }
-
-  {
-    AsyncGpuBlock block;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
-    }
-  }
-
-  /* add the bias-vector */
-  if (biases_) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer2::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  AsyncGpuBlock block;
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.cpp b/paddle/legacy/gserver/layers/ContextProjection.cpp
deleted file mode 100644
index 8bcf32663..000000000
--- a/paddle/legacy/gserver/layers/ContextProjection.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(context, ContextProjection);
-
-ContextProjection::ContextProjection(const ProjectionConfig& config,
-                                     ParameterPtr parameter,
-                                     bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(config.has_context_start());
-  CHECK(config.has_context_length());
-  if (config.context_start() == 0 && config.context_length() == 1) {
-    config_.set_trainable_padding(false);
-  }
-  if (config_.trainable_padding()) {
-    CHECK(parameter);
-    beginPad_ = std::max(0, -config.context_start());
-    endPad_ = std::max(0, config.context_start() + config.context_length() - 1);
-    size_t totalPad = beginPad_ + endPad_;
-    size_t inputDim = parameter->getSize() / totalPad;
-    CHECK_EQ(config.input_size(), inputDim);
-    CHECK_EQ(inputDim * totalPad, parameter->getSize());
-    weight_.reset(new Weight(totalPad, inputDim, parameter));
-  }
-  // init forward_ and backward_ functions
-  init();
-}
-
-bool ContextProjection::init() {
-  size_t context_length = config_.context_length();
-  int context_start = config_.context_start();
-  bool is_padding = config_.trainable_padding();
-  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
-
-  createFunction(forward_,
-                 "ContextProjectionForward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_));
-  createFunction(backward_,
-                 "ContextProjectionBackward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_)
-                     .set("is_padding", is_padding)
-                     .set("total_pad", total_pad));
-
-  return true;
-}
-
-void ContextProjection::resetState() {
-  CHECK_LE(config_.context_start() + config_.context_length(), 1)
-      << "state is not allowed for future context";
-  if (config_.context_start() >= 0) return;
-  Matrix::resizeOrCreate(state_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  Matrix::resizeOrCreate(state2_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  if (config_.trainable_padding()) {
-    state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start()));
-  } else {
-    state_->zeroMem();
-  }
-}
-
-void ContextProjection::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for ContextProjection state";
-  state_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr ContextProjection::getState() {
-  if (state_ == nullptr) {
-    return nullptr;
-  }
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(state_->clone(0, 0, false));
-  res->value[0]->copyFrom(*state_);
-  return res;
-}
-
-void ContextProjection::forward() {
-  CHECK(in_->value && out_->value);
-  CHECK(in_->sequenceStartPositions);
-
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  // size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  /// first use state_, otherwise use weight_(padding false === w nullptr)
-  auto w_ptr =
-      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*in_->value, *start_pos);
-  if (w_ptr) {
-    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
-                  *start_pos);
-  }
-  outputs.addArg(*out_->value, *start_pos, ADD_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  if (state_ && config_.context_start() < 0) {
-    CHECK_EQ(1, in_->getNumSequences());
-    const int* starts = in_->sequenceStartPositions->getData(false);
-    int length = starts[1] - starts[0];
-    if (-config_.context_start() <= length) {
-      MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(),
-                                            -config_.context_start());
-      state_->copyFrom(*sub);
-    } else {
-      int prevLength = -config_.context_start() - length;
-      state2_->subMatrix(0, prevLength)
-          ->copyFrom(*state_->subMatrix(length, prevLength));
-      state2_->subMatrix(prevLength, length)
-          ->copyFrom(*in_->value->subMatrix(starts[0], length));
-      std::swap(state_, state2_);
-    }
-  }
-}
-
-void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value && out_->value && out_->grad);
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(batch_size, out_->value->getHeight());
-  CHECK_EQ(static_cast<int>(backward_.size()), 1)
-      << "Only one backward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  auto start_pos = in_->sequenceStartPositions;
-  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(
-      CpuMatrix(
-          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
-      *in_->sequenceStartPositions->getVector(useGpu_),
-      ADD_TO);
-  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                           w_ptr ? w_ptr->getHeight() : 0,
-                           input_dim),
-                 ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-
-  if (config_.trainable_padding()) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.h b/paddle/legacy/gserver/layers/ContextProjection.h
deleted file mode 100644
index 9c2171454..000000000
--- a/paddle/legacy/gserver/layers/ContextProjection.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * @brief Context projection concatenate features in adjacent time steps in
- * a sequence. The i-th row of the output is the concatenation of
- * context_length rows of the input. The context_length rows are the
- * consecutive rows from the i+shift_start row.
- *
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * The config file api is context_projection.
- */
-class ContextProjection : public Projection {
- public:
-  /**
-   * Constructor. If context_start is zero and context_lenth is one, it will
-   * set trainable_padding false. trainable_padding is an optional arguments
-   * and if it is set, constructor will set learned weight, which is used to
-   * pad output.
-   */
-  ContextProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-  virtual void resetState();
-
-  virtual void setState(LayerStatePtr state);
-
-  virtual LayerStatePtr getState();
-
-  virtual bool init();
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  /// number of extra timesteps added at the beginning
-  size_t beginPad_;
-  /// number of extra timesteps added at the end
-  size_t endPad_;
-  /// state_ and state2_ are used in sequence generating and saved
-  /// previous inputs.
-  MatrixPtr state_;
-  MatrixPtr state2_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.cpp b/paddle/legacy/gserver/layers/Conv3DLayer.cpp
deleted file mode 100644
index d072a7423..000000000
--- a/paddle/legacy/gserver/layers/Conv3DLayer.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Conv3DLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(conv3d, Conv3DLayer);
-
-bool Conv3DLayer::init(const LayerMap &layerMap,
-                       const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    M_.push_back(numFilters_ / conf.groups());
-    K_.push_back(filterPixels_[index] * filterChannels_[index]);
-
-    // create a new weight
-    size_t height, width;
-    width = filterPixels_[index] * filterChannels_[index];
-    height = numFilters_;
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    ++index;
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t Conv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  outputH_.clear();
-  outputW_.clear();
-  outputD_.clear();
-  N_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    outputW_.push_back(outputSize(
-        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    outputH_.push_back(outputSize(
-        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    outputD_.push_back(outputSize(
-        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += N_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(outputH_[0]);
-  getOutput().setFrameWidth(outputW_[0]);
-  getOutput().setFrameDepth(outputD_[0]);
-  return layerSize;
-}
-
-void Conv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-
-  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    const MatrixPtr &outMat = getOutputValue();
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    MatrixPtr wMat = weights_[i]->getW();
-    for (int n = 0; n < batchSize; ++n) {
-      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                       channels_[i],
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i]);
-
-      real *outData = outMat->getData() + n * outMat->getStride();
-      MatrixPtr outMatSub =
-          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
-      for (int g = 0; g < groups_[i]; g++) {
-        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-        MatrixPtr in = colBuf_->subMatrix(g * K, K);
-        MatrixPtr out = outMatSub->subMatrix(g * M, M);
-        out->mul(*wMatSub, *in, 1.0, 1.0);
-      }
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void Conv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad()) {
-      bpropWeights(i);
-    }
-    if (getInputGrad(i)) {
-      bpropData(i);
-    }
-    weights_[i]->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void Conv3DLayer::bpropWeights(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  const MatrixPtr &inMat = getInputValue(i);
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wGradMat = weights_[i]->getWGrad();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i]);
-
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
-      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
-    }
-  }
-}
-
-void Conv3DLayer::bpropData(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wMat = weights_[i]->getW();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    real *preGradData =
-        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
-      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
-    }
-    colBuf_->col2Vol(preGradData,
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i],
-                     1.0,
-                     1.0);
-  }
-}
-
-void Conv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void Conv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.h b/paddle/legacy/gserver/layers/Conv3DLayer.h
deleted file mode 100644
index cb42a2f36..000000000
--- a/paddle/legacy/gserver/layers/Conv3DLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- */
-class Conv3DLayer : public ConvBaseLayer {
- public:
-  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~Conv3DLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
- protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
deleted file mode 100644
index 76120915e..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-bool ConvBaseLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv")
-                  ? false
-                  : true;
-
-  /* Initialize the convolutional layer parameter */
-  numFilters_ = config_.num_filters();
-  sharedBiases_ = config_.shared_biases();
-  for (auto& inputConfig : config_.inputs()) {
-    const ConvConfig& conf = inputConfig.conv_conf();
-    padding_.push_back(conf.padding());
-    stride_.push_back(conf.stride());
-    dilation_.push_back(conf.dilation());
-    filterSize_.push_back(conf.filter_size());
-    paddingY_.push_back(conf.padding_y());
-    strideY_.push_back(conf.stride_y());
-    dilationY_.push_back(conf.dilation_y());
-    filterSizeY_.push_back(conf.filter_size_y());
-    channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
-                                              : conf.img_size());
-    imgSizeW_.push_back(conf.img_size());
-    groups_.push_back(conf.groups());
-    filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
-    outputW_.push_back(conf.output_x());
-
-    paddingZ_.push_back(conf.padding_z());
-    strideZ_.push_back(conf.stride_z());
-    filterSizeZ_.push_back(conf.filter_size_z());
-    imgSizeD_.push_back(conf.img_size_z());
-    outputD_.push_back(conf.output_z());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
-                            filterSizeZ_.back());
-  }
-
-  CHECK(inputLayers_.size() == parameters_.size());
-
-  // create new weights_ in derived class
-  // create new biases_ in derived class
-
-  // default caffe model
-  caffeMode_ = true;
-
-  return true;
-}
-
-size_t ConvBaseLayer::calOutputSize() {
-  auto clearAndReserve = [this](IntV* vec) {
-    vec->clear();
-    vec->reserve(this->inputLayers_.size());
-  };
-  clearAndReserve(&imgSizeH_);
-  clearAndReserve(&imgSizeW_);
-  clearAndReserve(&outputH_);
-  clearAndReserve(&outputW_);
-  size_t layerSize = 0;
-
-  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
-    size_t filterSizeY;
-    size_t filterSize;
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
-      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
-      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
-      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
-      const ConvConfig& conf = config_.inputs(i).conv_conf();
-      if (isDeconv_) {
-        if (inH[i] == 0)
-          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
-        if (inW[i] == 0) inW[i] = conf.output_x();
-        outH.push_back(imageSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(
-            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      } else {
-        if (inH[i] == 0)
-          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-        if (inW[i] == 0) inW[i] = conf.img_size();
-        outH.push_back(outputSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(outputSize(
-            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      }
-      CHECK_EQ(outH[i], outH[0]);
-      CHECK_EQ(outW[i], outW[0]);
-    }
-    getOutput().setFrameHeight(outH[0]);
-    getOutput().setFrameWidth(outW[0]);
-    layerSize = outH[0] * outW[0] * size_t(numFilters_);
-  };
-
-  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-
-  return layerSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.h b/paddle/legacy/gserver/layers/ConvBaseLayer.h
deleted file mode 100644
index 01e90e999..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-namespace paddle {
-
-/**
- * @brief A Base Convolution Layer, which convolves the input image
- * with learned filters and (optionally) adds biases.
- */
-
-class ConvBaseLayer : public Layer {
- protected:
-  typedef std::vector<int> IntV;
-
-  /// True if it's deconv layer, false if it's convolution layer
-  bool isDeconv_;
-
-  /// The number of filters.
-  int numFilters_;
-  /// The x dimension of the padding.
-  IntV padding_;
-  /// The y dimension of the padding.
-  IntV paddingY_;
-  /// The x dimension of the stride.
-  IntV stride_;
-  /// The y dimension of the stride.
-  IntV strideY_;
-  /// The x dimension of the dilation.
-  IntV dilation_;
-  /// The y dimension of the dilation.
-  IntV dilationY_;
-  /// The x dimension of a filter kernel.
-  IntV filterSize_;
-  /// The y dimension of a filter kernel.
-  IntV filterSizeY_;
-  /// The spatial dimensions of the convolution input.
-  IntV channels_;
-  /// The spatial dimensions of input feature map height.
-  IntV imgSizeH_;
-  /// The spatial dimensions of input feature map width.
-  IntV imgSizeW_;
-  /// filterPixels_ = filterSizeX_ * filterSizeY_.
-  IntV filterPixels_;
-  /// filterChannels_ = channels_/groups_.
-  IntV filterChannels_;
-  /// The spatial dimensions of output feature map height.
-  IntV outputH_;
-  /// The spatial dimensions of output feature map width.
-  IntV outputW_;
-
-  IntV outputD_;
-  IntV imgSizeD_;
-  IntV filterSizeZ_;
-  IntV strideZ_;
-  IntV paddingZ_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  IntV groups_;
-  /// Whether the bias is shared for feature in each channel.
-  bool sharedBiases_;
-
-  /// shape of weight: (numChannels * filterPixels_, numFilters)
-  WeightList weights_;
-  /// If shared_biases is false shape of bias: (numFilters_, 1)
-  /// If shared_biases is ture shape of bias:
-  /// (numFilters_ * outputX * outputY, 1)
-  std::unique_ptr<Weight> biases_;
-
-  /// True by default. The only difference is the calculation
-  /// of output size.
-  bool caffeMode_;
-
- public:
-  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
-   * in this function. Then it will calculate outputH_ and outputW_ and set them
-   * into output argument.
-   */
-  virtual size_t calOutputSize();
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
deleted file mode 100644
index e8e59b3bf..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvBaseOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK(useGpu);
-  CHECK_EQ(config_.input_indices_size(), 2L);
-
-  caffeMode_ = true;
-  getConvParams();
-  computeConvSizes();
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  workSpace_ = nullptr;
-
-  isSelectAlgo_ = false;
-}
-
-void ConvBaseOperator::allocConvWorkSpace() {
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    /*useDilation*/ false);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-
-  if (maxWorkSpace > workSpaceInBytes_) {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-    }
-    // total amount of storage needed
-    workSpace_ = hl_malloc_device(maxWorkSpace);
-    workSpaceInBytes_ = maxWorkSpace;
-  }
-}
-
-void ConvBaseOperator::computeConvSizes() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingY_,
-                                   padding_,
-                                   strideY_,
-                                   stride_);
-}
-
-void ConvBaseOperator::reshapeImageDescriptors() {
-  hl_tensor_reshape(imageDesc_,
-                    1,
-                    channels_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_tensor_reshape(outputDesc_,
-                    1,
-                    numFilters_,
-                    outputH_,
-                    outputW_,
-                    numFilters_ * outputH_ * outputW_,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingY_,
-                                  padding_,
-                                  strideY_,
-                                  stride_);
-}
-
-void ConvBaseOperator::getConvParams() {
-  configNumFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  configChannels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-  if (isDeconv_) {
-    channels_ = configNumFilters_;
-    numFilters_ = configChannels_;
-  } else {
-    channels_ = configChannels_;
-    numFilters_ = configNumFilters_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.h b/paddle/legacy/gserver/layers/ConvBaseOperator.h
deleted file mode 100644
index 4ac77f2d7..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseOperator.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "Operator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvBaseOperator : public Operator {
- public:
-  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvBaseOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-
-    hl_destroy_tensor_descriptor(imageDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-
- protected:
-  /**
-   * Get convolution parameters from layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-
-  /**
-   * Allocate Gpu Memory for cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace();
-
-  /**
-   * Create cudnn tensor descriptor for convolution operation.
-   */
-  void computeConvSizes();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshapeImageDescriptors();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  virtual void reshape(int batchSize) = 0;
-
-  /**
-   * Check filter size is equal to the size calculated by parameters from
-   * layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-
-  /// Most of member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  bool isDeconv_;
-  int imageH_, imageW_, outputH_, outputW_;
-  hl_tensor_descriptor imageDesc_;
-  hl_tensor_descriptor outputDesc_;
-  hl_filter_descriptor filterDesc_;
-  hl_convolution_descriptor convDesc_;
-  bool caffeMode_;
-  int inputOffset_, outputOffset_, weightOffset_;
-  int numFilters_, channels_;
-
-  /// from parsing config
-  int configNumFilters_, configChannels_;
-  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
-  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
-
-  /// Following member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
-  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
-  size_t workSpaceInBytes_;
-  void *workSpace_;
-  bool isSelectAlgo_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
deleted file mode 100644
index ff5d3412d..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
-
-ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
-                                       ParameterPtr parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(useGpu);  // only support GPU
-  getConvParams();
-  initCudnn();
-
-  size_t height = filterH_ * filterW_ * channels_ / groups_;
-  size_t width = numFilters_;
-  weight_.reset(new Weight(height, width, parameter));
-  weightOffset_ = height * width / groups_;
-}
-
-void ConvBaseProjection::getConvParams() {
-  const ConvConfig &conf = config_.conv_conf();
-  paddingH_ = conf.padding_y();
-  paddingW_ = conf.padding();
-
-  strideH_ = conf.stride_y();
-  strideW_ = conf.stride();
-
-  dilationH_ = conf.dilation_y();
-  dilationW_ = conf.dilation();
-  CHECK_GT(dilationH_, 0);
-  CHECK_GT(dilationW_, 0);
-
-  filterH_ = conf.filter_size_y();
-  filterW_ = conf.filter_size();
-
-  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  configImgW_ = conf.img_size();
-
-  configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  configOutW_ = conf.output_x();
-
-  configChannels_ = conf.channels();
-  configNumFilters_ = config_.num_filters();
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-
-  channels_ = (isDeconv_) ? configNumFilters_ : configChannels_;
-  numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_;
-
-  groups_ = conf.groups();
-  CHECK_EQ(channels_ % groups_, 0);
-  CHECK_EQ(numFilters_ % groups_, 0);
-}
-
-void ConvBaseProjection::initCudnn() {
-  hl_create_filter_descriptor(&filterDesc_,
-                              channels_ / groups_,
-                              numFilters_ / groups_,
-                              filterH_,
-                              filterW_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingH_,
-                                   paddingW_,
-                                   strideH_,
-                                   strideW_,
-                                   dilationH_,
-                                   dilationW_);
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-}
-
-void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
-  // The stride between two consecutive samples in the output of ConvProjection
-  // may not be numFilters_ * outputH_ * outputW_ (conv) or
-  // channels_ * imageH_ * imageW_ (deconv)
-  // for example, in the case of layer ConcatenateLayer2 with two
-  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
-  // So the calculation of nStride is different from CudnnConvLayer.
-  size_t nStrideImage, nStrideOutput;
-  if (isDeconv_) {
-    nStrideImage = out_->value->getStride();
-    nStrideOutput = numFilters_ * outputH_ * outputW_;
-  } else {
-    nStrideImage = channels_ * imageH_ * imageW_;
-    nStrideOutput = out_->value->getStride();
-  }
-
-  hl_tensor_reshape(imageDesc_,
-                    batchSize,
-                    channels_ / groups_,
-                    imageH_,
-                    imageW_,
-                    nStrideImage,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-
-  hl_tensor_reshape(outputDesc_,
-                    batchSize,
-                    numFilters_ / groups_,
-                    outputH_,
-                    outputW_,
-                    nStrideOutput,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingH_,
-                                  paddingW_,
-                                  strideH_,
-                                  strideW_,
-                                  dilationH_,
-                                  dilationW_);
-}
-
-void ConvBaseProjection::reshape(int batchSize) {
-  size_t width = calOutputSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(calInputSize(), in_->value->getWidth());
-
-  reshapeTensorDesc(batchSize);
-  bool useDilation = false;
-  if (dilationH_ > 1 || dilationW_ > 1) {
-    useDilation = true;
-  }
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    useDilation);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-  workSpaceInBytes_ = maxWorkSpace;
-
-  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-}
-
-void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandlePtr> &convMem = *convMem_;
-  if (convMem.empty()) {
-    int numDevices = hl_get_device_count();
-    convMem.resize(numDevices);
-  }
-
-  int devId = hl_get_device();
-  MemoryHandlePtr localMem = convMem[devId];
-  if (NULL == localMem || size > localMem->getAllocSize()) {
-    localMem = std::make_shared<GpuMemoryHandle>(size);
-  }
-  return localMem->getBuf();
-}
-
-ConvBaseProjection::~ConvBaseProjection() {
-  hl_destroy_tensor_descriptor(imageDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_filter_descriptor(filterDesc_);
-  hl_destroy_convolution_descriptor(convDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.h b/paddle/legacy/gserver/layers/ConvBaseProjection.h
deleted file mode 100644
index dcf5ce0f4..000000000
--- a/paddle/legacy/gserver/layers/ConvBaseProjection.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Base class for ConvProjection and ConvTransProjection.
- */
-class ConvBaseProjection : public Projection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvBaseProjection(const ProjectionConfig& config,
-                     ParameterPtr parameter,
-                     bool useGpu);
-
-  ~ConvBaseProjection();
-
- protected:
-  void getConvParams();
-  void initCudnn();
-
-  void reshapeTensorDesc(int batchSize);
-  void reshape(int batchSize);
-
-  virtual size_t calOutputSize() = 0;
-  virtual size_t calInputSize() = 0;
-
-  static void* getSpaceBytes(size_t size);
-
-  /// True if it's deconv projection layer, false if it's ConvProjection layer
-  bool isDeconv_;
-  /// imageH_ and imageW_ / outputH_ and outputW_
-  /// is calculated from the input layer.
-  int imageH_, imageW_;
-  int outputH_, outputW_;
-  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
-  /// is obtained from config.
-  int configImgH_, configImgW_;
-  int configOutH_, configOutW_;
-  /// channels_ and numFilters_ are defined in terms of convolution semantics
-  int channels_, numFilters_;
-  /// configChannels and configNumFilters_ are obtained from config
-  /// For Conv they are the same as channels_ and numFilters
-  /// For ConvTrans they are opposite to channels_ and numFilters
-  int configChannels_, configNumFilters_;
-  int paddingH_, paddingW_;
-  int strideH_, strideW_;
-  int dilationH_, dilationW_;
-  int filterH_, filterW_;
-  /// One group offset of input data.
-  int inputOffset_;
-  /// One group offset of output data.
-  int outputOffset_;
-  /// One group offset of weight.
-  int weightOffset_;
-  int groups_;
-
-  /// Cudnn tensor descriptor for input.
-  hl_tensor_descriptor imageDesc_;
-  /// Cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// Cudnn tensor descriptor for filter.
-  hl_filter_descriptor filterDesc_;
-  /// Cudnn tensor descriptor for a convolution operation.
-  hl_convolution_descriptor convDesc_;
-
-  /// Record the algorithm for forward convolution, which is obtained by cudnn
-  /// api to search the best suited algorithm.
-  int fwdAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// filter coefficients.
-  int bwdFilterAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// the output.
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-  bool bias_;
-
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.cpp b/paddle/legacy/gserver/layers/ConvOperator.cpp
deleted file mode 100644
index 5276b2c39..000000000
--- a/paddle/legacy/gserver/layers/ConvOperator.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(conv, ConvOperator);
-
-void ConvOperator::reshape(int batchSize) {
-  imageH_ = ins_[0]->getFrameHeight();
-  imageW_ = ins_[0]->getFrameWidth();
-  if (imageH_ == 0) imageH_ = imgSizeY_;
-  if (imageW_ == 0) imageW_ = imgSize_;
-  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the outputSizes are consistent with config
-  CHECK_EQ(outputH_, outputY_);
-  CHECK_EQ(outputW_, outputX_);
-  out_->setFrameHeight(outputH_);
-  out_->setFrameWidth(outputW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = channels_ * imageH_ * imageW_;
-  outputOffset_ = numFilters_ * outputH_ * outputW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(out_->value,
-                         batchSize,
-                         outputH_ * outputW_ * numFilters_,
-                         false,
-                         useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_forward(imageDesc_,
-                             inputData,
-                             outputDesc_,
-                             outData,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace_,
-                             workSpaceInBytes_,
-                             fwdAlgo_);
-    }
-  }
-}
-
-void ConvOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       inputData,
-                                       outputDesc_,
-                                       outGrad,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_data(imageDesc_,
-                                     inputGrad,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     wgtData,
-                                     convDesc_,
-                                     workSpace_,
-                                     workSpaceInBytes_,
-                                     bwdDataAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.h b/paddle/legacy/gserver/layers/ConvOperator.h
deleted file mode 100644
index 8f3162011..000000000
--- a/paddle/legacy/gserver/layers/ConvOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvOperator : public ConvBaseOperator {
- public:
-  ConvOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.cpp b/paddle/legacy/gserver/layers/ConvProjection.cpp
deleted file mode 100644
index b40cdac25..000000000
--- a/paddle/legacy/gserver/layers/ConvProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(conv, ConvProjection);
-
-size_t ConvProjection::calOutputSize() {
-  imageH_ = in_->getFrameHeight();
-  imageW_ = in_->getFrameWidth();
-  if (imageH_ == 0) imageH_ = configImgH_;
-  if (imageW_ == 0) imageW_ = configImgW_;
-  outputH_ = outputSize(imageH_,
-                        (filterH_ - 1) * dilationH_ + 1,
-                        paddingH_,
-                        strideH_,
-                        /* caffeMode */ true);
-  outputW_ = outputSize(imageW_,
-                        (filterW_ - 1) * dilationW_ + 1,
-                        paddingW_,
-                        strideW_,
-                        /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(outputH_);
-  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
-
-  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;
-  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;
-  return outputH_ * outputW_ * configNumFilters_;
-}
-
-size_t ConvProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
-}
-
-void ConvProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
-
-    real *inputData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_forward(imageDesc_,
-                           inputData,
-                           outputDesc_,
-                           outData,
-                           filterDesc_,
-                           wgtData,
-                           convDesc_,
-                           workSpace,
-                           fwdLimitBytes_,
-                           fwdAlgo_);
-  }
-}
-
-void ConvProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inputData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     inputData,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inputGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_backward_data(imageDesc_,
-                                   inputGrad,
-                                   outputDesc_,
-                                   outGrad,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace,
-                                   bwdDataLimitBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.h b/paddle/legacy/gserver/layers/ConvProjection.h
deleted file mode 100644
index 890a17e2f..000000000
--- a/paddle/legacy/gserver/layers/ConvProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvProjection : public ConvBaseProjection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
deleted file mode 100644
index b7ecbe556..000000000
--- a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for circular convluation of two vectors,
- * which is used in NEURAL TURING MACHINE.
- * - Input: two vectors, the first is data (batchSize x dataDim)
- * the second is shift weights (batchSize x shiftDim)
- * - Output: a vector (batchSize x dataDim)
- * Assumed that:
- * - a[in]: contains M elements.
- * - b[in]: contains N elements (N should be odd).
- * - c[out]: contains M elements.
- *
- * \f[
- *     c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
- * \f]
- *
- * In this formula:
- *  - a's index is computed modulo M.
- *  - b's index is comupted modulo N.
- *
- * The config file api is conv_shift_layer.
- */
-
-class ConvShiftLayer : public Layer {
- public:
-  explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvShiftLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(conv_shift, ConvShiftLayer);
-
-bool ConvShiftLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ConvShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dataDim = inV0->getWidth();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(dataDim, getSize());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
-  outV->circularConv(*inV0, *inV1);
-}
-
-void ConvShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
-
-  if (inG0 && inG1) {
-    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
deleted file mode 100644
index f4ce2affb..000000000
--- a/paddle/legacy/gserver/layers/ConvTransOperator.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(convt, ConvTransOperator);
-
-void ConvTransOperator::reshape(int batchSize) {
-  outputH_ = ins_[0]->getFrameHeight();
-  outputW_ = ins_[0]->getFrameWidth();
-  if (outputH_ == 0) outputH_ = outputY_;
-  if (outputW_ == 0) outputW_ = outputX_;
-  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the imageSizes are consistent with config
-  CHECK_EQ(imageH_, imgSizeY_);
-  CHECK_EQ(imageW_, imgSize_);
-  out_->setFrameHeight(imageH_);
-  out_->setFrameWidth(imageW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = numFilters_ * outputH_ * outputW_;
-  outputOffset_ = channels_ * imageH_ * imageW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvTransOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(
-      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_backward_data(imageDesc_,
-                                   outData,
-                                   outputDesc_,
-                                   inputData,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace_,
-                                   workSpaceInBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-}
-
-void ConvTransOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       outGrad,
-                                       outputDesc_,
-                                       inputData,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_forward(imageDesc_,
-                               outGrad,
-                               outputDesc_,
-                               inputGrad,
-                               filterDesc_,
-                               wgtData,
-                               convDesc_,
-                               workSpace_,
-                               workSpaceInBytes_,
-                               fwdAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.h b/paddle/legacy/gserver/layers/ConvTransOperator.h
deleted file mode 100644
index 206335a01..000000000
--- a/paddle/legacy/gserver/layers/ConvTransOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvTransOperator : public ConvBaseOperator {
- public:
-  ConvTransOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvTransOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.cpp b/paddle/legacy/gserver/layers/ConvTransProjection.cpp
deleted file mode 100644
index 00e34c8f2..000000000
--- a/paddle/legacy/gserver/layers/ConvTransProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransProjection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(convt, ConvTransProjection);
-size_t ConvTransProjection::calOutputSize() {
-  outputH_ = in_->getFrameHeight();
-  outputW_ = in_->getFrameWidth();
-  if (outputH_ == 0) outputH_ = configOutH_;
-  if (outputW_ == 0) outputW_ = configOutW_;
-  imageH_ = imageSize(outputH_,
-                      (filterH_ - 1) * dilationH_ + 1,
-                      paddingH_,
-                      strideH_,
-                      /* caffeMode */ true);
-
-  imageW_ = imageSize(outputW_,
-                      (filterW_ - 1) * dilationW_ + 1,
-                      paddingW_,
-                      strideW_,
-                      /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(imageH_);
-  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
-
-  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;
-  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;
-  return imageH_ * imageW_ * configNumFilters_;
-}
-
-size_t ConvTransProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
-}
-
-void ConvTransProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
-
-    real *inData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_backward_data(imageDesc_,
-                                 outData,
-                                 outputDesc_,
-                                 inData,
-                                 filterDesc_,
-                                 wgtData,
-                                 convDesc_,
-                                 workSpace,
-                                 bwdDataLimitBytes_,
-                                 bwdDataAlgo_);
-  }
-}
-
-void ConvTransProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     outGrad,
-                                     outputDesc_,
-                                     inData,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_forward(imageDesc_,
-                             outGrad,
-                             outputDesc_,
-                             inGrad,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace,
-                             fwdLimitBytes_,
-                             fwdAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.h b/paddle/legacy/gserver/layers/ConvTransProjection.h
deleted file mode 100644
index 9b63dd473..000000000
--- a/paddle/legacy/gserver/layers/ConvTransProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvTransProjection : public ConvBaseProjection {
- public:
-  /**
-   * Constructor.
-   */
-  ConvTransProjection(const ProjectionConfig& config,
-                      ParameterPtr parameter,
-                      bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvTransProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
deleted file mode 100644
index c38ab251f..000000000
--- a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for weighted sum of vectors,
- * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
- * TRANSLATE
- * - Input: the the size of the first input is weightDim,
- *          and the size of the second input is weightdim * dataDim.
- * - Output: the sizeof the output is dataDim
- * \f[
- *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
- *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
- * \f]
- * Note that the above computation is for one sample. Multiple samples are
- * processed in one batch.
- *
- * The config file api is linear_comb_layer.
- */
-class ConvexCombinationLayer : public Layer {
- protected:
-  /// A matrix pointer pointing to second input.
-  MatrixPtr tmpMtx0;
-  /// A matrix pointer pointing to first input.
-  MatrixPtr tmpRow0;
-  /// A matrix pointer pointing to output.
-  MatrixPtr tmpRow1;
-
- public:
-  explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvexCombinationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
-
-bool ConvexCombinationLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(2U, inputLayers_.size());
-  size_t dataDim = getSize();
-  size_t weightDim = inputLayers_[0]->getSize();
-
-  CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize())
-      << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           weightDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ weightDim,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  return true;
-}
-
-void ConvexCombinationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-    tmpRow0->setData(inV0->getData() + i * weightDim);
-    tmpRow1->setData(outV->getData() + i * dataDim);
-
-    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
-  }
-}
-
-void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str());
-
-  if (inG0) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inG0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-
-      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
-    }
-  }
-
-  if (inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
-
-      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.cpp b/paddle/legacy/gserver/layers/CosSimLayer.cpp
deleted file mode 100644
index ab8d7cc1f..000000000
--- a/paddle/legacy/gserver/layers/CosSimLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cos, CosSimLayer);
-
-bool CosSimLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2LU);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  {
-    REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
-    MatrixPtr prevOut1 = getInputValue(0);
-    MatrixPtr prevOut2 = getInputValue(1);
-
-    CHECK(outV && prevOut1 && prevOut2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*prevOut1);
-    inputs.addArg(*prevOut2);
-    outputs.addArg(*outV, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimLayer::backward(const UpdateCallback& callback) {
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
-
-    const auto outG = this->getOutputGrad();
-    const auto outV = this->getOutputValue();
-    const auto inV1 = this->getInputValue(0);
-    const auto inV2 = this->getInputValue(1);
-    auto inG1 = this->getInputGrad(0);
-    auto inG2 = this->getInputGrad(1);
-    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*outG);
-    inputs.addArg(*outV);
-    inputs.addArg(*inV1);
-    inputs.addArg(*inV2);
-    outputs.addArg(*inG1, ADD_TO);
-    outputs.addArg(*inG2, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.h b/paddle/legacy/gserver/layers/CosSimLayer.h
deleted file mode 100644
index b08e2c6a3..000000000
--- a/paddle/legacy/gserver/layers/CosSimLayer.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief A layer for calculating cosine similarity between two vector
- * \f[
- * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+...
- * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim) *
- * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (batchSize * 1)
- *
- * The config file api is cos_sim.
- */
-class CosSimLayer : public Layer {
- public:
-  explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
deleted file mode 100644
index 03de0be81..000000000
--- a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-/**
- * @brief A layer for computing cosine similarity between a vector
- * and each row of a matrix
- * out[i] = cos_scale * cos(in1, in2(i,:));
- * @note used in NEURAL TURING MACHINE
- *
- * Input1: a vector (batchSize * dataDim)
- *
- * Input2: a matrix in vector form (batchSize * (weightDim*dataDim))
- *
- * Output: a vector (batchSize * weightDim)
- */
-
-class CosSimVecMatLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpMtx1;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-  MatrixPtr tmpRow2;
-  MatrixPtr tmpRow3;
-
- public:
-  explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimVecMatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
-
-bool CosSimVecMatLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dataDim = inputLayers_[0]->getSize();
-  size_t numKeys = getSize();
-  size_t memoryDim = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow2 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow3 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx1 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimVecMatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t numKeys = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, numKeys);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  CHECK(outV && inV0 && inV1);
-  REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpRow2, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
-  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV0->getHeight();
-  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
-  REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
-
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpRow1->setData(inG0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpMtx1->setData(inG1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-    tmpRow3->setData(outG->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpRow3);
-    inputs.addArg(*tmpRow2);
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpMtx1, ADD_TO);
-    outputs.addArg(*tmpRow1, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.cpp b/paddle/legacy/gserver/layers/CostLayer.cpp
deleted file mode 100644
index 18b5b77bd..000000000
--- a/paddle/legacy/gserver/layers/CostLayer.cpp
+++ /dev/null
@@ -1,748 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CostLayer.h"
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-bool CostLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  coeff_ = config_.coeff();
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 2UL);
-  CHECK_LE(inputLayers_.size(), 3UL);
-  if (inputLayers_.size() == 3) {
-    weightLayer_ = inputLayers_[2];
-  }
-  return true;
-}
-
-void CostLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  int size = 1;
-  resetOutput(batchSize, size);
-
-  const MatrixPtr& output = getInputValue(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  /* get the cost value for each sample*/
-  forwardImp(*output, label, *getOutputValue());
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    getOutputValue()->dotMul(*getOutputValue(), *weight);
-  }
-}
-
-void CostLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  const Argument& output = getInput(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  bool support = true;
-  if (weightLayer_) {
-    support = output.grad->getAbsSum() == 0;
-  }
-
-  backwardImp(*output.value, label, *output.grad);
-
-  if (weightLayer_) {
-    CHECK(support) << "Weighted cost layer '" << getName()
-                   << "' must be the last layer "
-                      "connected to the output layer '"
-                   << getOutputLayer()->getName() << "'";
-    output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_));
-  }
-  if (coeff_ != real(1.0f)) {
-    output.grad->add(coeff_, 0);
-  }
-}
-
-//
-// class MultiClassCrossEntropy
-//
-bool MultiClassCrossEntropy::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropy::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  target.oneHotCrossEntropy(output, *label.ids);
-}
-
-void MultiClassCrossEntropy::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-}
-
-//
-// class MultiClassCrossEntropyWithSelfNorm
-//
-REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm,
-               MultiClassCrossEntropyWithSelfNorm);
-
-bool MultiClassCrossEntropyWithSelfNorm::init(
-    const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
-                                                    Argument& label,
-                                                    Matrix& target) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-  sftMaxSum_->log2();
-
-  target.oneHotCrossEntropy(output, *label.ids);
-  target.add(*sftMaxSum_);
-
-  sftMaxSum_->square2();
-  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
-}
-
-void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
-                                                     Argument& label,
-                                                     Matrix& outputG) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-
-  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
-  sftMaxSum_->reciprocal2(*sumInv_);
-
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-  outputG.addColumnVector(*sumInv_);
-
-  sftMaxSum_->log2();
-  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
-  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
-
-  outputG.addColumnVector(*sumInv_);
-}
-
-//
-// class SoftBinaryClassCrossEntropy
-//
-REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy);
-
-bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap,
-                                       const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output,
-                                             Argument& label,
-                                             Matrix& target) {
-  Matrix::resizeOrCreate(
-      targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-  targetPerDim_->softCrossEntropy(output, *label.value);
-  targetPerDim_->rowSum(target);
-}
-
-void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& outputG) {
-  outputG.softCrossEntropyBp(output, *label.value);
-}
-
-//
-// class SumOfSquaresCostLayer
-//
-
-REGISTER_LAYER(square_error, SumOfSquaresCostLayer);
-
-bool SumOfSquaresCostLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SumOfSquaresCostLayer::forwardImp(Matrix& output,
-                                       Argument& label,
-                                       Matrix& target) {
-  target.sumOfSquares(output, *label.value);
-}
-
-void SumOfSquaresCostLayer::backwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& outputG) {
-  outputG.sumOfSquaresBp(output, *label.value);
-}
-
-//
-// class SmoothL1CostLayer
-//
-
-REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
-
-bool SmoothL1CostLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SmoothL1CostLayer::forwardImp(Matrix& output,
-                                   Argument& label,
-                                   Matrix& target) {
-  MatrixPtr targetCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    targetCpu =
-        Matrix::create(target.getHeight(), target.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    targetCpu->copyFrom(target);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
-    target.copyFrom(*targetCpu);
-  } else {
-    target.smoothL1(output, *label.value, 1.0);
-  }
-}
-
-void SmoothL1CostLayer::backwardImp(Matrix& output,
-                                    Argument& label,
-                                    Matrix& outputG) {
-  MatrixPtr outputGCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    outputGCpu =
-        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    outputGCpu->copyFrom(outputG);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
-    outputG.copyFrom(*outputGCpu);
-  } else {
-    outputG.smoothL1Bp(output, *label.value, 1.0);
-  }
-}
-
-//
-// class RankingCost
-//
-bool RankingCost::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-
-  bool ret = Layer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  if (inputLayers_.size() == 4) {
-    weightLayer_ = inputLayers_[3];
-  }
-  return true;
-}
-
-void RankingCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer(0))->getHeight();
-  int size = 1;
-  resizeOutput(batchSize, size);
-  Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_);
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, try ids
-    IVectorPtr idLabel = getInput(*getLabelLayer()).ids;
-    CHECK(idLabel) << "label layer has neither value nor ids";
-    CHECK_EQ((size_t)batchSize, idLabel->getSize());
-    Matrix::resizeOrCreate(
-        labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_);
-    labelBuf_->copyFrom(*idLabel);
-    label = labelBuf_;
-  }
-
-  MatrixPtr output[] = {getInputValue(*getOutputLayer(0)),
-                        getInputValue(*getOutputLayer(1))};
-  MatrixPtr target = this->getOutputValue();
-  margin_->sub(*output[0], *output[1]);
-
-  // for validation
-  size_t height = output[0]->getHeight();
-  target->biggerThan(*(output[0]), *(output[1]), *label);
-  double total = static_cast<double>(height);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-    total = weight->getSum();
-  }
-  double pos = target->getSum();
-  posPairCount_ += pos;
-  negPairCount_ += (total - pos);
-
-  // forward
-  target->logisticRegressionLoss(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-  }
-}
-
-void RankingCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, but in ids
-    // use labelBuf_ (should already resized and copied during forward)
-    label = labelBuf_;
-  }
-
-  Matrix::resizeOrCreate(
-      marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_);
-  marginGrad_->zeroMem();
-  marginGrad_->logisticRegressionLossBp(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    marginGrad_->dotMul(*marginGrad_, *weight);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-  getInputGrad(1)->sub(*marginGrad_);
-}
-
-void RankingCost::onPassEnd() {
-  double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_);
-  LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_
-            << " neg= " << negPairCount_;
-
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-}
-
-//
-// class LambdaCost
-//
-REGISTER_LAYER(lambda_cost, LambdaCost);
-
-bool LambdaCost::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  truncationSize_ = config_.ndcg_num();
-  maxSortSize_ = config_.max_sort_size();
-  if (maxSortSize_ != -1) {
-    CHECK_GE(maxSortSize_, truncationSize_)
-        << "maxSortSize must be greater than or equal to NDCG size!";
-  }
-  LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_
-            << ", Max partial sort size = " << maxSortSize_;
-  CHECK(!useGpu_) << "LambdaRank supports CPU only!";
-  return Layer::init(layerMap, parameterMap);
-}
-
-void LambdaCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  resizeOutput(batchSize, 1);
-
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  MatrixPtr target = this->getOutputValue();
-
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-  real* targetData = target->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    real NDCG = calcNDCG(
-        outputData + beginPos, scoreData + beginPos, endPos - beginPos);
-    for (int j = beginPos; j < endPos; ++j) {
-      targetData[j] = NDCG;
-    }
-  }
-}
-
-void LambdaCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  Matrix::resizeOrCreate(marginGrad_,
-                         score->getHeight(),
-                         1,
-                         /* trans= */ false,
-                         useGpu_);
-  marginGrad_->zeroMem();
-
-  real* gradData = marginGrad_->getData();
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    calcGrad(outputData + beginPos,
-             scoreData + beginPos,
-             gradData + beginPos,
-             endPos - beginPos);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-}
-
-void LambdaCost::calcGrad(const real* outputScore,
-                          const real* score,
-                          real* gradData,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-  int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size);
-
-  scorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    scorePair_.push_back(std::make_pair(score[i], i));
-  }
-  if (size <= sortSize) {
-    std::sort(scorePair_.begin(),
-              scorePair_.end(),
-              [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-                return a.first > b.first;
-              });
-  } else {
-    std::partial_sort(
-        scorePair_.begin(),
-        scorePair_.begin() + sortSize,
-        scorePair_.end(),
-        [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-          return a.first > b.first;
-        });
-  }
-
-  real maxDCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  for (int i = 0; i < sortSize; ++i) {
-    for (int j = i + 1; j < size; ++j) {
-      int index_i = scorePair_[i].second;
-      int index_j = scorePair_[j].second;
-      real score_i = score[index_i];
-      real score_j = score[index_j];
-      real dcgDif = 0;
-      if (j < sortSize) {
-        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
-                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
-      } else {
-        dcgDif =
-            (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
-      }
-
-      real lambda_ij =
-          -std::abs(dcgDif) /
-          (1 + std::exp(outputScore[index_i] - outputScore[index_j]));
-      gradData[index_i] += lambda_ij / maxDCG;
-      gradData[index_j] -= lambda_ij / maxDCG;
-    }
-  }
-}
-
-real LambdaCost::calcNDCG(const real* outputScore,
-                          const real* score,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-
-  outputScorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    outputScorePair_.push_back(std::make_pair(outputScore[i], i));
-  }
-  std::partial_sort(
-      outputScorePair_.begin(),
-      outputScorePair_.begin() + truncationSize_,
-      outputScorePair_.end(),
-      [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-        return a.first > b.first;
-      });
-
-  real DCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    DCG +=
-        (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2);
-  }
-
-  scoreVec_.resize(size);
-  std::copy(score, score + size, scoreVec_.begin());
-  real maxDCG = 0;
-  std::partial_sort(scoreVec_.begin(),
-                    scoreVec_.begin() + truncationSize_,
-                    scoreVec_.end(),
-                    std::greater<real>());
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  return DCG / maxDCG;
-}
-
-//
-// class MultiBinaryLabelCrossEntropy
-//
-
-REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy);
-
-bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& target) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!label.value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    target.multiBinaryLabelCrossEntropy(output, *value);
-  } else {
-    Matrix::resizeOrCreate(
-        targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-    targetPerDim_->binaryLabelCrossEntropy(output, *value);
-    targetPerDim_->rowSum(target);
-  }
-}
-
-void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
-                                               Argument& label,
-                                               Matrix& outputG) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    outputG.multiBinaryLabelCrossEntropyBp(output, *value);
-  } else {
-    outputG.binaryLabelCrossEntropyBp(output, *value);
-  }
-}
-
-bool HuberCost::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  CostLayer::init(layerMap, parameterMap);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
-  return true;
-}
-
-void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(
-          getInput(i), false, HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-}
-
-//
-// Huber loss for robust regression.
-//
-REGISTER_LAYER(huber_regression, HuberRegressionLoss);
-
-bool HuberRegressionLoss::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  HuberCost::init(layerMap, parameterMap);
-  delta_ = config_.delta();
-  return true;
-}
-
-void HuberRegressionLoss::forwardImp(Matrix& output,
-                                     Argument& label,
-                                     Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  size_t dim = output.getWidth();
-  CHECK(label.value);
-  CHECK_EQ((*label.value).getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(dim, (*label.value).getWidth());
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = std::abs(lbl[index] - out[index]);
-      if (a <= delta_)
-        cost[i] += a * a / 2;
-      else
-        cost[i] += delta_ * (a - delta_ / 2);
-    }
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberRegressionLoss::backwardImp(Matrix& output,
-                                      Argument& label,
-                                      Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  size_t dim = output.getWidth();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = lbl[index] - out[index];
-      if (std::abs(a) <= delta_)
-        grad[index] += -a;
-      else
-        grad[index] += a > 0 ? -delta_ : delta_;
-    }
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
-}
-
-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber_classification, HuberTwoClassification);
-
-bool HuberTwoClassification::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return HuberCost::init(layerMap, parameterMap);
-}
-
-void HuberTwoClassification::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  CHECK(label.ids);
-  CHECK_EQ((*label.ids).getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), (size_t)1);
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      cost[i] = -4 * a;
-    else if (a < 1)
-      cost[i] = (1 - a) * (1 - a);
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberTwoClassification::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      grad[i] += -4 * y;
-    else if (a < 1)
-      grad[i] += -2 * (1 - a) * y;
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples);
-}
-/**
- * This cost layer compute the sum of its input as loss.
- * \f[
- * o(i) = \sum_{j=1}^D y_{ij}
- * \f]
- */
-class SumCostLayer : public Layer {
- public:
-  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    if (!ret) return ret;
-    CHECK_EQ(inputLayers_.size(), 1UL);
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const MatrixPtr& input = getInputValue(0);
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = input->getHeight();
-    int size = 1;
-    resizeOutput(batchSize, size);
-    output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  }
-
-  void backward(const UpdateCallback& callback = nullptr) override {
-    getInputGrad(0)->add((real)1);
-  }
-};
-
-REGISTER_LAYER(sum_cost, SumCostLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.h b/paddle/legacy/gserver/layers/CostLayer.h
deleted file mode 100644
index 9bfec0e2b..000000000
--- a/paddle/legacy/gserver/layers/CostLayer.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Base class for a particular type of cost layer.
- * This type of cost should have one data layer, one label layer
- * and an optional weight layer as input.
- * The derived class should implemnt forwardImp() and backwardImp()
- * which calculate the cost for data and label. The weight is automatically
- * handled by the base class.
- */
-class CostLayer : public Layer {
- public:
-  explicit CostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void forwardImp(Matrix& outputValue,
-                          Argument& label,
-                          Matrix& cost) = 0;
-
-  virtual void backwardImp(Matrix& outputValue,
-                           Argument& label,
-                           Matrix& outputGrad) = 0;
-
- protected:
-  LayerPtr weightLayer_;
-  real coeff_;
-};
-
-/**
- * The cross-entropy loss for multi-class classification task.
- * The loss function is:
- *
- * \f[
- * L = - \sum_{i}{t_{k} * log(P(y=k))}
- * \f]
- */
-class MultiClassCrossEntropy : public CostLayer {
- public:
-  explicit MultiClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * The cross-entropy with self-normalization for multi-class classification.
- *
- * The loss function is:
- * \f[
- * L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
- * \f]
- *
- * The \f$Z(x)\f$ is the softmax normalizer.
- *
- * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
- *     Richard Schwartz, and John Makhoul. Fast and robust neural
- *     network joint models for statistical machine translation.
- *     In Proceedings of the ACL 2014 Conference.
- */
-class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
- public:
-  explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sumInv_;
-};
-
-/**
- * The cross-entropy for soft binary class.
- * \f[
- * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
- * \f]
- */
-class SoftBinaryClassCrossEntropy : public CostLayer {
- public:
-  explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  MatrixPtr targetPerDim_;
-};
-
-/**
- * This cost layer compute Euclidean (L2) loss for real-valued regression
- * tasks.
- * \f[
- * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
- * \f]
- */
-class SumOfSquaresCostLayer : public CostLayer {
- public:
-  explicit SumOfSquaresCostLayer(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * This cost layer compute smooth L1 loss for real-valued regression
- * tasks.
- * \f[
- * L =
- *   0.5 * x^2    if / -1 < |x| < 1 /
- *   |x| - 0.5    / otherwise /
- * \f]
- *
- * x = output - label
- */
-class SmoothL1CostLayer : public CostLayer {
- public:
-  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * A cost layer for learning to rank (LTR) task. This layer contains at leat
- * three inputs.
- * \f[
- *  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
- *  o_{i,j} =  o_i - o_j  \\
- *  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
- * \f]
- *
- * [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
- *      Rank useing Gradient Descent.
- */
-class RankingCost : public Layer {
- public:
-  explicit RankingCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[2]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  void onPassEnd() override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-    (void)output;
-    (void)label;
-    (void)cost;
-  }
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {
-    (void)outputValue;
-    (void)label;
-    (void)outputGrad;
-  }
-
- private:
-  double posPairCount_;
-  double negPairCount_;
-  MatrixPtr margin_;
-  MatrixPtr marginGrad_;
-  /// if input label is put in ids (not value), copy to this buffer.
-  MatrixPtr labelBuf_;
-  LayerPtr weightLayer_;
-};
-
-/**
- * LambdaRank os a method for learning arbitrary information retrieval
- * measures. It can be applied to any algorithm that learns through gradient
- * descent. LambdaRank is a listwise method, in that the cost depends on the
- * sorted order of the documents. LambdaRank gives the gradient of cost
- * function:
- *
- * \f[
- * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
- * \f]
- *
- * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
- *     with Nonsmooth Cost Functions.
- */
-class LambdaCost : public Layer {
- public:
-  explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getScoreLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  real calcNDCG(const real* outputScore, const real* score, int size);
-  void calcGrad(const real* outputScore,
-                const real* score,
-                real* gradData,
-                int size);
-
- private:
-  MatrixPtr marginGrad_;
-  int truncationSize_;
-  int maxSortSize_;
-  std::vector<std::pair<real, int>> scorePair_;
-  std::vector<std::pair<real, int>> outputScorePair_;
-  std::vector<real> scoreVec_;
-};
-
-/**
- * Cross entropy for multi binary labels.
- * \f[
- * cost[i] = -sum(label[i][j]*log(output[i][j]) +
- *            (1-label[i][j])*log(1-output[i][j]))
- * \f]
- */
-class MultiBinaryLabelCrossEntropy : public CostLayer {
- protected:
-  MatrixPtr targetPerDim_;
-
- public:
-  explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/*
- * A base layer for HuberRegressionLoss and HuberTwoClassification.
- */
-class HuberCost : public CostLayer {
- public:
-  std::vector<Argument> tmpCpuInput_;
-
-  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override {}
-};
-
-/**
- * Huber loss for robust regression.
- *
- * Given output f(x), label y and delta, the loss is:
- * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
- * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
- */
-class HuberRegressionLoss : public HuberCost {
- public:
-  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
- protected:
-  real delta_;
-};
-
-/**
- * Huber loss for robust 2-classes classification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
- * Loss = 4 * y * f, if y* f < -1 \\
- * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
- * Loss = 0, otherwise
- */
-class HuberTwoClassification : public HuberCost {
- public:
-  explicit HuberTwoClassification(const LayerConfig& config)
-      : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-typedef std::shared_ptr<CostLayer> CostLayerPtr;
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.cpp b/paddle/legacy/gserver/layers/CropLayer.cpp
deleted file mode 100644
index d891375ec..000000000
--- a/paddle/legacy/gserver/layers/CropLayer.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(crop, CropLayer);
-
-bool CropLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
-  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
-  crop_axis_ = config_.axis();
-  for (int i = 0; i < config_.offset_size(); i++) {
-    crop_offsets_.push_back(config_.offset(i));
-  }
-
-  // 1. get input_0 shape
-  auto& input0_img_conf = config_.inputs(0).image_conf();
-  inDims_ = TensorShape({0,
-                         input0_img_conf.channels(),
-                         input0_img_conf.has_img_size_y()
-                             ? input0_img_conf.img_size_y()
-                             : input0_img_conf.img_size(),
-                         input0_img_conf.img_size()});
-  // 2. get target dims from config
-  if (config_.inputs_size() == 1) {
-    targetDims_ = TensorShape({config_.shape(0),
-                               config_.shape(1),
-                               config_.shape(2),
-                               config_.shape(3)});
-  } else {
-    // 2. get input_1 shape
-    auto& input1_img_conf = config_.inputs(1).image_conf();
-    targetDims_ = TensorShape({0,
-                               input1_img_conf.channels(),
-                               input1_img_conf.has_img_size_y()
-                                   ? input1_img_conf.img_size_y()
-                                   : input1_img_conf.img_size(),
-                               input1_img_conf.img_size()});
-  }
-
-  // 3. get final crop corner
-  int dimSize = 4;
-  crop_corner_ = {0, 0, 0, 0};
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      if (crop_offsets_.size() > 1) {
-        crop_corner_[i] = crop_offsets_[i - crop_axis_];
-      } else {
-        crop_corner_[i] = crop_offsets_[0];
-      }
-    }
-  }
-
-  outDims_ = TensorShape(4);
-
-  createFunction(
-      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
-  createFunction(
-      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
-
-  return true;
-}
-
-void CropLayer::setOutDims() {
-  MatrixPtr input = inputLayers_[1]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  // get target dims from input_1
-  if (config_.inputs_size() == 2) {
-    targetDims_.setDim(0, batchSize);
-    int ch = config_.inputs(0).image_conf().channels();
-    if (ch != 0) targetDims_.setDim(1, ch);
-    int h = inputLayers_[1]->getOutput().getFrameHeight();
-    if (h != 0) targetDims_.setDim(2, h);
-    int w = inputLayers_[1]->getOutput().getFrameWidth();
-    if (w != 0) targetDims_.setDim(3, w);
-  }
-  // get final crop shape from target dims and crop axis
-  std::vector<uint32_t> crop_shape;
-  int dimSize = 4;
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      crop_shape.push_back(targetDims_[i]);
-    } else {
-      crop_shape.push_back(inDims_[i]);
-    }
-  }
-
-  outDims_.reshape(
-      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
-  output_.setFrameHeight(crop_shape[2]);
-  output_.setFrameWidth(crop_shape[3]);
-}
-
-void CropLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-}
-
-void CropLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(outDims_[0], size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("CropForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CropLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.h b/paddle/legacy/gserver/layers/CropLayer.h
deleted file mode 100644
index ef88bc483..000000000
--- a/paddle/legacy/gserver/layers/CropLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer crop input according to the specify conf.
- *         input_0: input to be cropped
- *         input_1: optional reference input
- *         axis: start dimension to be croped
- *         offset: offset of cropping  in each dimension
- *         shape: if reference input layer was not setted,
- *                  crop input as this shape conf
- */
-class CropLayer : public Layer {
- public:
-  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CropLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  void setOutDims();
-  void setInDims();
-
-  int32_t crop_axis_;
-  std::vector<uint32_t> crop_offsets_;
-  std::vector<uint32_t> crop_corner_;
-  TensorShape inDims_;
-  TensorShape targetDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
deleted file mode 100644
index 0fe100a96..000000000
--- a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
-                                                    size_t iter,
-                                                    size_t spatialDim) {
-  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
-                        channels_,
-                        spatialDim,
-                        false,
-                        useGpu_);
-}
-
-MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
-                                                     size_t iter,
-                                                     size_t spatialDim) {
-  return Matrix::create(
-      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
-}
-
-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void CrossChannelNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inV = getInputValue(0);
-
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = inV->getWidth();
-  CHECK_EQ(getSize(), dataDim);
-
-  reserveOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-  size_t spatialDim = dataDim / channels_;
-
-  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
-  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-
-  inV->square2(*dataBuffer_);
-  for (size_t i = 0; i < batchSize; i++) {
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
-    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    // compute norm.
-    spatialBuffer_->sumCols(*dataTmp, 1, 0);
-    // add eps to avoid overflow
-    spatialBuffer_->add(1e-6);
-    spatialBuffer_->sqrt2(*spatialBuffer_);
-    normTmp->copyFrom(*spatialBuffer_);
-    outVTmp->copyFrom(*inVTmp);
-    outVTmp->divRowVector(*spatialBuffer_);
-    // scale the layer.
-    outVTmp->mulColVector(*scale_->getW());
-  }
-}
-
-void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr outV = getOutputValue();
-
-  size_t batchSize = inG->getHeight();
-  size_t dataDim = inG->getWidth();
-  size_t spatialDim = dataDim / channels_;
-
-  MatrixPtr inGBuffer;
-  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
-
-  dataBuffer_->dotMul(*outG, *outV);
-  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
-  scaleDiff_->zeroMem();
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
-    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    channelBuffer_->sumRows(*dataTmp, 1, 0);
-    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
-    // store a / scale[i] in scaleDiff_ temporary
-    scaleDiff_->add(*channelBuffer_, 1.);
-
-    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
-    // scale the grad
-    inGBuffer->copyFrom(*inVTmp);
-    inGBuffer->mulRowVector(*spatialBuffer_);
-    // divide by square of norm
-    spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGBuffer->divRowVector(*spatialBuffer_);
-    // subtract
-    inGBuffer->add(*outGTmp, -1, 1);
-    // divide by norm
-    inGBuffer->divRowVector(*normTmp);
-    // scale the diff
-    inGBuffer->mulColVector(*scale_->getW());
-
-    inGTmp->add(*inGBuffer);
-  }
-  // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
-  scale_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
deleted file mode 100644
index f3bf21485..000000000
--- a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossEntropyOverBeam.h"
-
-namespace paddle {
-
-void CostForOneSequence::calValidExpandStep() {
-  validExpansionCount_ = 0;
-  goldAsExtraPath_ = true;
-
-  for (size_t i = 0; i < beams_->expansionCount; ++i) {
-    real gold = static_cast<real>(beams_->gold[i]);
-    if (i) {
-      real* start = beams_->candidateIds[i - 1]->getData();
-      goldRowIds_[i] = std::count_if(
-          start,
-          start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
-          [](const real& val) { return val != -1.; });
-    } else {
-      goldRowIds_[i] = 0;
-    }
-
-    real* start =
-        beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
-    real* findEnd = std::find(start, start + beamSize_, gold);
-    validExpansionCount_++;
-
-    if (start + beamSize_ == findEnd) return;
-    goldColIds_[i] = findEnd - start;
-  }
-  if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
-}
-
-size_t CostForOneSequence::initLastExpansion() {
-  int beamId = validExpansionCount_ - 1;
-  const MatrixPtr candidates = beams_->candidateIds[beamId];
-  size_t height = candidates->getHeight();
-
-  /* initialization the last expansion. */
-  size_t pathCount = std::count_if(candidates->getData(),
-                                   candidates->getData() + height * beamSize_,
-                                   [](const real& val) { return val != -1; });
-  /*
-   * if the gold sequence falls off the beam during search, add the gold
-   * sequence as the last path into the all expanded candidates.
-   */
-  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;
-
-  pathRowIdsInEachBeam_.clear();
-  pathRowIdsInEachBeam_.resize(validExpansionCount_,
-                               std::vector<int>(pathCount, 0));
-  parentIdsInBeam_.clear();
-  parentIdsInBeam_.resize(pathCount, 0);
-
-  if (goldAsExtraPath_) {
-    /* add gold sequence into the total expansion. */
-    pathRowIdsInEachBeam_[beamId].back() =
-        beams_->gold[beamId] +
-        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
-    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
-  } else {
-    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
-    goldIdsInFinalExpansion_ =
-        std::count_if(candidates->getData(),
-                      candidates->getData() + goldOffset,
-                      [](const real& val) { return val != -1.; });
-  }
-
-  /*
-   * TODO(caoying): fix this, store the indices of selected candidate
-   * paths into Argument.ids
-   */
-  real* ids = candidates->getData();
-  size_t curIdx = 0;
-  for (size_t i = 0; i < height; ++i) {
-    int basePos = getSeqStartPos(beamId, i);
-    for (size_t j = 0; j < beamSize_; ++j) {
-      int id = ids[i * beamSize_ + j];
-      if (id == -1) continue;
-      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
-      parentIdsInBeam_[curIdx++] = i;
-    }
-  }
-  return pathCount;
-}
-
-void CostForOneSequence::constructTotalExpansion() {
-  /*
-   * construct the entire expanded beam by begining with the last search
-   * in which gold falls off the beam.
-   */
-  size_t totalPathCount = initLastExpansion();
-
-  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
-    const MatrixPtr candidates = beams_->candidateIds[beamId];
-    real* ids = candidates->getData();
-
-    int lastParentIdInBeam = -1;
-    int basePos = -1;
-    for (size_t i = 0;
-         i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount);
-         ++i) {
-      int id = ids[parentIdsInBeam_[i]];
-      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
-      if (parentIdsInBeam_[i] != lastParentIdInBeam)
-        basePos = getSeqStartPos(beamId, parentRowId);
-
-      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
-      lastParentIdInBeam = parentIdsInBeam_[i];
-      parentIdsInBeam_[i] = parentRowId;
-
-      if (goldAsExtraPath_)
-        pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
-            beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
-    }
-  }
-}
-
-real CostForOneSequence::globallyNormalizedScore() {
-  expandedPathScores_.resize(validExpansionCount_);
-
-  Matrix::resizeOrCreate(
-      softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false);
-  softmaxOut_->zeroMem();
-  MatrixPtr tmp = Matrix::create(
-      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
-
-  for (size_t i = 0; i < validExpansionCount_; ++i) {
-    Matrix::resizeOrCreate(expandedPathScores_[i],
-                           pathRowIdsInEachBeam_[i].size(),
-                           1,
-                           false,
-                           false);
-    expandedPathScores_[i]->zeroMem();
-
-    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
-                                        pathRowIdsInEachBeam_[i].size(),
-                                        false);
-    expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds);
-    tmp->add(*expandedPathScores_[i]);
-  }
-
-  softmaxOut_->softmax(*softmaxOut_);
-  return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]);
-}
-
-real CostForOneSequence::forward() {
-  calValidExpandStep();
-  constructTotalExpansion();
-  return globallyNormalizedScore();
-}
-
-void CostForOneSequence::backward() {
-  /*
-   * when softmax layer is the output layer, and it is combined with
-   * cross-entropy as cost. The derivate with regard to softmax's input
-   * is simply:
-   *
-   * grad_i = softmax_out_i - target_i,
-   *
-   * and here hard label is used.
-   */
-  softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
-
-  MatrixPtr tmp = Matrix::create(
-      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
-
-  for (size_t i = 0; i < validExpansionCount_; ++i) {
-    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
-                                        pathRowIdsInEachBeam_[i].size(),
-                                        false);
-    /*
-      beams_->scoreGrad[i] has been intialized outside this class, this
-      class only keeps a pointer pointing to the original input gradients,
-      so here does not need to allocate or initalize the memory.
-    */
-    tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
-  }
-}
-
-REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
-
-bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";
-
-  beamExpanCount_ = inputLayers_.size() / 3;
-
-  candidateScores_.resize(beamExpanCount_);
-  candidateScoreGrad_.resize(beamExpanCount_);
-
-  candidateInBeam_.resize(beamExpanCount_);
-  goldSequence_.resize(beamExpanCount_);
-  gradToInputs_.resize(beamExpanCount_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void CrossEntropyOverBeam::checkInputs() {
-  batchSize_ = 0;
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    const Argument& scores = getInput(i * 3);
-    const Argument& selCandidates = getInput(i * 3 + 1);
-    const Argument& goldSeq = getInput(i * 3 + 2);
-
-    if (i) {
-      CHECK(scores.hasSubseq()) << "input " << i << " "
-                                << inputLayers_[i * 3]->getName()
-                                << " should be a nested sequence";
-      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
-      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
-      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
-    } else {
-      CHECK(scores.hasSeq()) << "input " << i << " "
-                             << inputLayers_[i]->getName()
-                             << " should be a sequence";
-      batchSize_ = scores.getNumSequences();
-      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
-      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
-    }
-    CHECK_EQ(1U, scores.value->getWidth());
-    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
-  }
-}
-
-void CrossEntropyOverBeam::copyInputsToCpu() {
-  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
-    if (dynamic_cast<GpuMatrix*>(src.get())) {
-      Matrix::resizeOrCreate(
-          trg, src->getHeight(), src->getWidth(), false, false);
-      trg->copyFrom(*src);
-    } else {
-      trg = std::move(src);
-    }
-  };
-
-  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
-    if (dynamic_cast<GpuIVector*>(src.get())) {
-      IVector::resizeOrCreate(trg, src->getSize(), false);
-      trg->copyFrom(*src);
-    } else {
-      trg = std::move(src);
-    }
-  };
-
-  beamSplitPos_.clear();
-  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    copyValue(getInputValue(i * 3), candidateScores_[i]);
-    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
-    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);
-
-    if (i) {
-      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
-      const int* seqStarts = seqInfo->getMutableData(false);
-      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
-      const int* subSeqStarts = subSeqInfo->getMutableData(false);
-
-      size_t seqId = 1;
-      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
-           ++subSeqId) {
-        CHECK_LT(seqId, seqInfo->getSize());
-        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
-          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
-          seqId++;
-        }
-        beamSplitPos_[seqId - 1][i]++;
-      }
-    } else {
-      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
-    }
-  }
-}
-
-void CrossEntropyOverBeam::splitBatchBeams() {
-  beamCosts_.resize(batchSize_);
-  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));
-
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    int* seqStarts =
-        getInput(i * 3).sequenceStartPositions->getMutableData(false);
-
-    int* subSeqStarts = nullptr;
-    int maxLen = 0;
-    if (i) {
-      subSeqStarts =
-          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
-      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
-    } else {
-      maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
-    }
-
-    for (size_t j = 0; j < batchSize_; ++j) {
-      beamPerSeq_[j].scores[i] =
-          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
-                         seqStarts[j + 1] - seqStarts[j],
-                         1,
-                         false,
-                         false);
-      beamPerSeq_[j].scoreGrad[i] =
-          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
-                         seqStarts[j + 1] - seqStarts[j],
-                         1,
-                         false,
-                         false);
-
-      int offset = j ? beamSplitPos_[j - 1][i] : 0;
-      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
-      CHECK_GE(maxLen, offset + height);
-      beamPerSeq_[j].seqInfo[i] = IVector::create(
-          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);
-
-      beamPerSeq_[j].candidateIds[i] =
-          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
-                         height,
-                         beamSize_,
-                         false,
-                         false);
-      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];
-
-      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
-    }
-  }
-}
-
-void CrossEntropyOverBeam::resizeOutput() {
-  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
-  output_.value->zeroMem();
-
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    MatrixPtr inGrad = getInputGrad(i * 3);
-    if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
-      Matrix::resizeOrCreate(candidateScoreGrad_[i],
-                             inGrad->getHeight(),
-                             inGrad->getWidth(),
-                             false,
-                             false);
-    } else {
-      candidateScoreGrad_[i] = std::move(inGrad);
-    }
-    candidateScoreGrad_[i]->zeroMem();
-  }
-}
-
-void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
-  for (size_t i = 0; i < beamExpanCount_; ++i) {
-    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
-      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);
-
-    if (i == copyCount - 1) break;
-  }
-}
-
-void CrossEntropyOverBeam::forward(PassType passType) {
-  Layer::forward(passType);
-
-  checkInputs();
-  copyInputsToCpu();
-
-  resizeOutput();
-  splitBatchBeams();
-
-  MatrixPtr outputValue = getOutputValue();
-  for (size_t i = 0; i < batchSize_; ++i) {
-    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
-    beamCosts_[i].setData(std::move(ptr), beamSize_);
-    outputValue->getData()[i] = beamCosts_[i].forward();
-  }
-}
-
-void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < batchSize_; ++i) {
-    beamCosts_[i].backward();
-    copyGradToGpu(beamCosts_[i].getValidExpansionCount());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
deleted file mode 100644
index c8702b161..000000000
--- a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "CrossEntropyOverBeam.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/* This struct stores the beams in all search steps for a single sequence. */
-struct BeamExpansion {
-  std::vector<MatrixPtr> scores;
-  std::vector<IVectorPtr> seqInfo;
-
-  std::vector<MatrixPtr> candidateIds;
-  std::vector<int> gold;
-
-  std::vector<MatrixPtr> scoreGrad;
-
-  size_t expansionCount;
-
-  explicit BeamExpansion(int n) {
-    expansionCount = n;
-    scores.resize(expansionCount);
-    seqInfo.resize(expansionCount);
-    candidateIds.resize(expansionCount);
-    scoreGrad.resize(expansionCount);
-
-    gold.resize(expansionCount);
-  }
-};
-typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
-
-class CostForOneSequence {
- public:
-  CostForOneSequence()
-      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
-  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
-    beams_ = bPtr;
-    beamSize_ = beamSize;
-
-    expandedPathScores_.clear();
-    expandedPathScores_.resize(beams_->expansionCount);
-
-    goldRowIds_.clear();
-    goldRowIds_.resize(beams_->expansionCount, 0);
-    goldColIds_.clear();
-    goldColIds_.resize(beams_->expansionCount, -1);
-  }
-  size_t getValidExpansionCount() { return validExpansionCount_; }
-
-  real forward();
-  void backward();
-
- private:
-  void calValidExpandStep();
-  void constructTotalExpansion();
-  size_t initLastExpansion();
-  real globallyNormalizedScore();
-
-  int getSeqStartPos(size_t beamId, size_t rowId) {
-    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
-    int* starts = beams_->seqInfo[beamId]->getData();
-    return starts[rowId] - starts[0];
-  }
-
-  size_t beamSize_;
-  size_t validExpansionCount_;
-  bool goldAsExtraPath_;
-  std::vector<int> goldRowIds_;
-  std::vector<int> goldColIds_;
-
-  BeamExpansionPtr beams_;
-  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
-  std::vector<int> parentIdsInBeam_;
-  size_t goldIdsInFinalExpansion_;
-
-  std::vector<MatrixPtr> expandedPathScores_;
-
-  MatrixPtr softmaxOut_;
-};
-
-class CrossEntropyOverBeam : public Layer {
- public:
-  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- private:
-  void checkInputs();
-  void copyInputsToCpu();
-  void resizeOutput();
-  void copyGradToGpu(size_t copyCount);
-  void splitBatchBeams();
-
-  size_t beamExpanCount_;
-  size_t batchSize_;
-  size_t beamSize_;
-
-  /*
-   * the process of constructing beams is not friendly to GPU, currently, this
-   * layer only runs on CPU, if any of its inputs is on GPU memory, then copy
-   * it to CPU memory.
-   */
-  std::vector<MatrixPtr> candidateScores_;
-  std::vector<MatrixPtr> candidateScoreGrad_;
-  std::vector<MatrixPtr> candidateInBeam_;
-  std::vector<MatrixPtr> gradToInputs_;
-  std::vector<IVectorPtr> goldSequence_;
-  std::vector<std::vector<int>> beamSplitPos_;
-
-  /*
-   * split entire bath of beams into beam per sequnence and store the result
-   * into this member.
-   */
-  std::vector<BeamExpansion> beamPerSeq_;
-  /* beamCosts_ is used to propagate error in one sequence. */
-  std::vector<CostForOneSequence> beamCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
deleted file mode 100644
index 051155e0d..000000000
--- a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnBatchNormLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/cuda/include/hl_batch_norm.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
-
-bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnBatchNorm only support GPU";
-
-  hl_create_tensor_descriptor(&ioDesc_);
-  hl_create_tensor_descriptor(&bnParamDesc_);
-  hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1);
-
-  return true;
-}
-
-void CudnnBatchNormLayer::reshape(int batchSize) {
-  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
-}
-
-void CudnnBatchNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  reshape(batchSize);
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* output = getOutputValue()->getData();
-  real* gamma = weight_->getW()->getData();
-  real* beta = biases_->getW()->getData();
-  real* movingMean = movingMean_->getW()->getData();
-  real* movingVar = movingVar_->getW()->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  if (!useGlobalStats_) {
-    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
-    real* savedMean = savedMean_->getData();
-    real* savedInvVar = savedInvVar_->getData();
-    hl_batch_norm_forward_training(ioDesc_,
-                                   input,
-                                   ioDesc_,
-                                   output,
-                                   bnParamDesc_,
-                                   gamma,
-                                   beta,
-                                   1.0 - movingAvgFraction_,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   savedMean,
-                                   savedInvVar);
-  } else {
-    // used movingMean and movingVar in testing
-    if (batchSize <= 1024) {
-      hl_batch_norm_forward_inference(ioDesc_,
-                                      input,
-                                      ioDesc_,
-                                      output,
-                                      bnParamDesc_,
-                                      gamma,
-                                      beta,
-                                      movingMean,
-                                      movingVar,
-                                      eps_);
-    } else {
-      // There is a limitation in cudnn library.
-      // When the batch size is larger than 1024 in cuDNN v5.1,
-      // the cudnnBatchNormalizationForwardInference will fail.
-      hl_batch_norm_cuda_inference(input,
-                                   output,
-                                   gamma,
-                                   beta,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   batchSize,
-                                   channels_,
-                                   imageH_ * imageD_,
-                                   imageW_);
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* outGrad = getOutputGrad()->getData();
-  real* inGrad = getInputGrad(0)->getData();
-  real* gamma = weight_->getW()->getData();
-  real* savedMean = savedMean_->getData();
-  real* savedInvVar = savedInvVar_->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
-    Matrix::resizeOrCreate(m, h, w, false, true);
-    m->zeroMem();
-    *p = m->getData();
-  };
-
-  real* gammaGrad = nullptr;
-  real* betaGrad = nullptr;
-  if (weight_->getWGrad()) {
-    gammaGrad = weight_->getWGrad()->getData();
-  } else {
-    create(tmpWGrad_, 1, channels_, &gammaGrad);
-  }
-  if (biases_ && biases_->getWGrad()) {
-    betaGrad = biases_->getWGrad()->getData();
-  } else {
-    create(tmpBiasGrad_, 1, channels_, &betaGrad);
-  }
-
-  hl_batch_norm_backward(ioDesc_,
-                         input,
-                         ioDesc_,
-                         outGrad,
-                         ioDesc_,
-                         inGrad,
-                         bnParamDesc_,
-                         gamma,
-                         gammaGrad,
-                         betaGrad,
-                         eps_,
-                         savedMean,
-                         savedInvVar);
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    biases_->getParameterPtr()->incUpdate(callback);
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-CudnnBatchNormLayer::~CudnnBatchNormLayer() {
-  hl_destroy_tensor_descriptor(ioDesc_);
-  hl_destroy_tensor_descriptor(bnParamDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
deleted file mode 100644
index 3b33b983b..000000000
--- a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cudnn.h>
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment.
- * @note Cudnn version must >= v4.0, and better to use the latest version
- * (v5.1).
- *
- * The config file api is batch_norm_layer.
- */
-
-class CudnnBatchNormLayer : public BatchNormBaseLayer {
- public:
-  explicit CudnnBatchNormLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config) {}
-
-  ~CudnnBatchNormLayer();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  /**
-   * reshape tensor of ioDesc_.
-   */
-  void reshape(int batchSize);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  /// Epsilon value used in the batch normalization formula.
-  /// Same epsilon value should be used in forward and backward functions.
-  double eps_;
-
-  /// Input/output tensor descriptor desc
-  hl_tensor_descriptor ioDesc_;
-  /// Shared tensor descriptor desc for the 6 tenros:
-  /// bnScale, bnBias, running mean/var, save_mean/var
-  hl_tensor_descriptor bnParamDesc_;
-
-  /**
-   * @brief The gradient of weight and bias in cudnn api can not be empty.
-   * If set is_static for weight or bias, it will not allocate memory for them,
-   * and the gradient is NULL. In this case, will use two matrix.
-   */
-  MatrixPtr tmpWGrad_, tmpBiasGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
deleted file mode 100644
index 9353cca9c..000000000
--- a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnConvBaseLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
-REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
-
-bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
-                              const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnConvLayer only support gpu";
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projConf_.reserve(inputLayers_.size());
-
-  numFilters_ = config_.num_filters();
-  CHECK(config_.shared_biases());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    ProjectionConfig *conf = new ProjectionConfig();
-    if (isDeconv_) {
-      conf->set_type("convt");
-    } else {
-      conf->set_type("conv");
-    }
-    conf->set_num_filters(numFilters_);
-    ConvConfig *convConf = conf->mutable_conv_conf();
-    *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
-    conf->set_input_size(getPrev(i)->getSize());
-    conf->set_output_size(getSize());
-    projConf_.emplace_back(conf);
-    projections_.emplace_back(
-        Projection::create(*projConf_[i], parameters_[i], useGpu_));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  if (biases_.get() && sharedBiases_) {
-    hl_create_tensor_descriptor(&biasDesc_);
-    hl_create_tensor_descriptor(&outputDesc_);
-    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
-  }
-
-  return true;
-}
-
-void CudnnConvBaseLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  resetOutput(batchSize, calOutputSize());
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->forward(&getInput(i), &getOutput(), passType);
-  }
-
-  if (biases_) {
-    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
-    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH = outputH_[0];
-    int outW = outputW_[0];
-
-    hl_tensor_reshape(outputDesc_,
-                      batchSize,
-                      numFilters_,
-                      outH,
-                      outW,
-                      numFilters_ * outH * outW,
-                      outH * outW,
-                      outW,
-                      1);
-    real *outData = getOutputValue()->getData();
-    real *biasData = biases_->getW()->getData();
-    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
-  }
-
-  forwardActivation();
-}
-
-void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
-    real *biasGrad = biases_->getWGrad()->getData();
-    real *outGrad = getOutputGrad()->getData();
-    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->backward(callback);
-  }
-}
-
-CudnnConvBaseLayer::~CudnnConvBaseLayer() {
-  if (biases_) {
-    hl_destroy_tensor_descriptor(biasDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
deleted file mode 100644
index d050183eb..000000000
--- a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "Projection.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A 2-dimension conv layer implemented by cuDNN. It only
- *        supports GPU mode. We automatic select CudnnConvLayer for GPU
- *        mode and ExpandConvLayer for CPU mode if you set type of "conv".
- *        User also can specfiy type of "exconv" or "cudnn_conv" for
- *        particular type.
- *
- * The config file api is img_conv_layer.
- */
-class CudnnConvBaseLayer : public ConvBaseLayer {
- protected:
-  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
-  std::vector<std::unique_ptr<Projection>> projections_;
-
-  hl_tensor_descriptor biasDesc_;
-  hl_tensor_descriptor outputDesc_;
-
- public:
-  explicit CudnnConvBaseLayer(const LayerConfig& config)
-      : ConvBaseLayer(config) {}
-
-  ~CudnnConvBaseLayer();
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
deleted file mode 100644
index c790dfd71..000000000
--- a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnPoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-bool CudnnPoolLayer::typeCheck(const std::string &poolType,
-                               hl_pooling_mode_t *mode) {
-  if (poolType == "cudnn-max-pool") {
-    if (mode) {
-      *mode = HL_POOLING_MAX;
-    }
-  } else if (poolType == "cudnn-avg-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE;
-    }
-  } else if (poolType == "cudnn-avg-incl-pad-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
-    }
-  } else {
-    return false;
-  }
-
-  return true;
-}
-
-CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) {
-  const std::string &pool_type = config.inputs(0).pool_conf().pool_type();
-  CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true);
-}
-
-bool CudnnPoolLayer::init(const LayerMap &layerMap,
-                          const ParameterMap &parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-
-  CHECK(useGpu_) << "CudnnPoolLayer only support gpu";
-
-  hl_create_tensor_descriptor(&inputDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-
-  windowHeight = sizeY_;
-  windowWidth = sizeX_;
-  heightPadding = confPaddingY_;
-  widthPadding = confPadding_;
-  strideHeight = strideY_;
-  strideWidth = stride_;
-
-  hl_create_pooling_descriptor(&poolingDesc_,
-                               mode_,
-                               windowHeight,
-                               windowWidth,
-                               heightPadding,
-                               widthPadding,
-                               strideHeight,
-                               strideWidth);
-
-  return true;
-}
-
-void CudnnPoolLayer::reshape(int batchSize) {
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imageH_ == 0) {
-    imageH_ = imgSizeY_;
-  }
-  if (imageW_ == 0) {
-    imageW_ = imgSize_;
-  }
-  CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(),
-           channels_ * imageH_ * imageW_);
-  outputH_ = outputSize(imageH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ =
-      outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-
-  hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_);
-  hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_);
-}
-
-void CudnnPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(inputLayers_[0]->getOutputValue()->useGpu());
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  reshape(batchSize);
-  resetOutput(batchSize, outputH_ * outputW_ * channels_);
-
-  real *inputData = getInputValue(0)->getData();
-  real *outData = getOutputValue()->getData();
-  hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_);
-}
-
-void CudnnPoolLayer::backward(const UpdateCallback &callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  real *inputData = getInputValue(0)->getData();
-  real *inputGrad = getInputGrad(0)->getData();
-  real *outData = getOutputValue()->getData();
-  real *outGrad = getOutputGrad()->getData();
-  hl_pooling_backward(inputDesc_,
-                      inputData,
-                      inputGrad,
-                      outputDesc_,
-                      outData,
-                      outGrad,
-                      poolingDesc_);
-}
-
-CudnnPoolLayer::~CudnnPoolLayer() {
-  hl_destroy_tensor_descriptor(inputDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_pooling_descriptor(poolingDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.h b/paddle/legacy/gserver/layers/CudnnPoolLayer.h
deleted file mode 100644
index fc249354d..000000000
--- a/paddle/legacy/gserver/layers/CudnnPoolLayer.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "PoolLayer.h"
-
-namespace paddle {
-
-/**
- * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by
- * cudnn api and only supports GPU.
- *
- * The config file api is img_pool_layer.
- */
-
-class CudnnPoolLayer : public PoolLayer {
- protected:
-  int windowHeight, windowWidth;
-  int heightPadding, widthPadding, strideHeight, strideWidth;
-  int imageH_, imageW_, outputH_, outputW_;
-  /// mode_ is poolint type, inlcuding "cudnn-max-pool", "cudnn-avg-pool"
-  /// "cudnn-avg-excl-pad-pool".
-  hl_pooling_mode_t mode_;
-  /// cudnn tensor descriptor for input.
-  hl_tensor_descriptor inputDesc_;
-  /// cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// A description of a pooling operation.
-  hl_pooling_descriptor poolingDesc_;
-
- public:
-  static bool typeCheck(const std::string& poolType,
-                        hl_pooling_mode_t* mode = nullptr);
-  explicit CudnnPoolLayer(const LayerConfig& config);
-  ~CudnnPoolLayer();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Reshape input and output tensor descriptor.
-   * The batch size maybe change during training in last batch of each pass.
-   * So reshaping is needed.
-   */
-  void reshape(int batchSize);
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataLayer.cpp b/paddle/legacy/gserver/layers/DataLayer.cpp
deleted file mode 100644
index 4cadaa766..000000000
--- a/paddle/legacy/gserver/layers/DataLayer.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(data, DataLayer);
-
-void DataLayer::copyDataToOutput(Argument& output) {
-  if (output.deviceId == data_.deviceId) {
-    output.value = data_.value;
-    output.in = data_.in;
-    output.grad = data_.grad;
-    output.ids = data_.ids;
-  } else {
-    SetDevice device(output.deviceId);
-    if (data_.value) {
-      if (!output.value) {
-        output.value = data_.value->clone(data_.value->getHeight(),
-                                          data_.value->getWidth(),
-                                          useGpu(output.deviceId));
-      } else {
-        output.value->resize(data_.value->getHeight(), data_.value->getWidth());
-      }
-      output.value->copyFrom(*data_.value);
-    }
-    if (data_.grad) {
-      Matrix::resizeOrCreate(output.grad,
-                             data_.grad->getHeight(),
-                             data_.grad->getWidth(),
-                             /* trans= */ false,
-                             useGpu(output.deviceId));
-    }
-    if (data_.ids) {
-      IVector::resizeOrCreate(
-          output.ids, data_.ids->getSize(), useGpu(output.deviceId));
-      output.ids->copyFrom(*data_.ids);
-    }
-  }
-  if (config_.height() && config_.width()) {
-    output.setFrameHeight(config_.height());
-    output.setFrameWidth(config_.width());
-  } else {
-    output.setFrameHeight(data_.getFrameHeight());
-    output.setFrameWidth(data_.getFrameWidth());
-  }
-  output.cpuSequenceDims = data_.cpuSequenceDims;
-  output.sequenceStartPositions = data_.sequenceStartPositions;
-  output.subSequenceStartPositions = data_.subSequenceStartPositions;
-  output.strs = data_.strs;
-
-  output.notifyValueReady();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataLayer.h b/paddle/legacy/gserver/layers/DataLayer.h
deleted file mode 100644
index d02f5a469..000000000
--- a/paddle/legacy/gserver/layers/DataLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-
-namespace paddle {
-/**
- * This layer just copy data to output, and has no backward propagation.
- *
- * The config file api is data_layer.
- */
-class DataLayer : public Layer {
- public:
-  explicit DataLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual void setData(const Argument& data) { data_ = data; }
-
-  /**
-   * Prefetch sparse matrix/ids only.
-   */
-  void prefetch() override { output_ = data_; }
-
-  /**
-   * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
-   * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
-   */
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    copyDataToOutput(output_);
-    if (FLAGS_show_layer_stat) {
-      showOutputStats();
-    }
-  }
-
-  /**
-   * Data layer's backward propagation do nothing.
-   */
-  void backward(const UpdateCallback& callback) override { (void)callback; }
-
-  void copyOutputToOtherDevice() override {
-    for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-      copyDataToOutput(outputOtherDevice_[i]);
-    }
-  }
-
- private:
-  void copyDataToOutput(Argument& output);
-
- protected:
-  Argument data_;
-};
-
-typedef std::shared_ptr<DataLayer> DataLayerPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.cpp b/paddle/legacy/gserver/layers/DataNormLayer.cpp
deleted file mode 100644
index 6820dfa4d..000000000
--- a/paddle/legacy/gserver/layers/DataNormLayer.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataNormLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(data_norm, DataNormLayer);
-
-bool DataNormLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weight */
-  CHECK(!biasParameter_) << "DataNormLayer does not need bias";
-  CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data")
-      << "DataNormLayer accepts one and only one DataLayer as its input layer";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_[0]->getSize(), getSize());
-  CHECK_EQ(parameters_[0]->getSize(), 5 * getSize());
-  CHECK(parameters_[0]->isStatic())
-      << "The parameter of DataNormLayer must be static";
-
-  weight_ = std::unique_ptr<Weight>(new Weight(5, getSize(), parameters_[0]));
-  min_ = Matrix::create(
-      nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_);
-  rangeReciprocal_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-  mean_ = Matrix::create(nullptr,
-                         /* height= */ 1,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  stdReciprocal_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-  decimalReciprocal_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize(),
-                                      /* trans= */ false,
-                                      useGpu_);
-
-  min_->setData(weight_->getW()->getData());
-  rangeReciprocal_->setData(weight_->getW()->getData() + getSize());
-  mean_->setData(weight_->getW()->getData() + 2 * getSize());
-  stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize());
-  decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize());
-
-  /* normalization strategy */
-  if (config_.data_norm_strategy() == "z-score") {
-    mode_ = kZScore;
-  } else if (config_.data_norm_strategy() == "min-max") {
-    mode_ = kMinMax;
-  } else if (config_.data_norm_strategy() == "decimal-scaling") {
-    mode_ = kDecimalScaling;
-  } else {
-    LOG(FATAL) << "Unknown data normalization strategy: "
-               << config_.data_norm_strategy();
-  }
-
-  return true;
-}
-
-void DataNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr inValue = getInputValue(0);
-  MatrixPtr outValue = getOutputValue();
-  outValue->copyFrom(*inValue);
-  switch (mode_) {
-    case kZScore: {
-      outValue->addBias(*mean_, -1.0);
-      outValue->colScale(0, *outValue, *stdReciprocal_);
-      break;
-    }
-    case kMinMax: {
-      outValue->addBias(*min_, -1.0);
-      outValue->colScale(0, *outValue, *rangeReciprocal_);
-      break;
-    }
-    case kDecimalScaling: {
-      outValue->colScale(0, *outValue, *decimalReciprocal_);
-      break;
-    }
-    default:
-      LOG(FATAL) << "should not reach here";
-  }
-}
-
-void DataNormLayer::backward(const UpdateCallback& callback) {
-  // The parameter for DataNormLayer is static, and does not need to be updated
-  (void)callback;
-
-  /* Calculate the input layers error */
-  const MatrixPtr& outGrad = getOutputGrad();
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad) {
-    switch (mode_) {
-      case kZScore: {
-        inGrad->addColScale(0, *outGrad, *stdReciprocal_);
-        break;
-      }
-      case kMinMax: {
-        inGrad->addColScale(0, *outGrad, *rangeReciprocal_);
-        break;
-      }
-      case kDecimalScaling: {
-        inGrad->addColScale(0, *outGrad, *decimalReciprocal_);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.h b/paddle/legacy/gserver/layers/DataNormLayer.h
deleted file mode 100644
index 7bb8e9282..000000000
--- a/paddle/legacy/gserver/layers/DataNormLayer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for data normalization
- * - Input: One and only one input layer is accepted. The input layer must
- *        be DataLayer with dense data type.
- * - Output: The normalization of the input data
- *
- * Reference:
- *    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
- *
- * Three data normalization methoeds are considered
- * - z-score: y = (x-mean)/std
- * - min-max: y = (x-min)/(max-min)
- * - decimal-scaling: y = x/10^j, where j is the smallest integer such that
- *max(|y|)<1
- */
-
-class DataNormLayer : public Layer {
- public:
-  enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
-
-  explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DataNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  int mode_;
-  std::unique_ptr<Weight> weight_;
-  MatrixPtr min_;
-  MatrixPtr rangeReciprocal_;  // 1/(max-min)
-  MatrixPtr mean_;
-  MatrixPtr stdReciprocal_;      // 1/std
-  MatrixPtr decimalReciprocal_;  // 1/10^j
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
deleted file mode 100644
index 2cd635564..000000000
--- a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DeConv3DLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(deconv3d, DeConv3DLayer);
-
-bool DeConv3DLayer::init(const LayerMap &layerMap,
-                         const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  // for Deconv, the dimension of Kernel is
-  // channel * output * depth * height * weigth
-  // Matrix storage format: (output * depth * height * weigth) x  channel
-  for (int index = 0; index < config_.inputs().size(); ++index) {
-    M_.push_back(filterChannels_[index]);
-    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * numFilters_;
-    width = filterChannels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t DeConv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  imgSizeW_.clear();
-  imgSizeH_.clear();
-  imgSizeD_.clear();
-  N_.clear();
-  NOut_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    imgSizeW_.push_back(
-        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    imgSizeH_.push_back(imageSize(
-        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    imgSizeD_.push_back(imageSize(
-        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += NOut_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(imgSizeH_[0]);
-  getOutput().setFrameWidth(imgSizeW_[0]);
-  getOutput().setFrameDepth(imgSizeD_[0]);
-  return layerSize;
-}
-
-void DeConv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  const MatrixPtr outMat = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    MatrixPtr wMat = weights_[i]->getW();
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    for (int n = 0; n < batchSize; ++n) {
-      real *inData = inMat->getData() + n * inMat->getStride();
-      for (int g = 0; g < groups_[i]; ++g) {
-        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
-        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
-        inData += M * N;
-      }
-      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
-                       numFilters_,
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i],
-                       1.0,
-                       1.0);
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void DeConv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-  int batchSize = getOutputGrad()->getHeight();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad() || this->needGradient_) {
-      int M = M_[i];
-      int N = N_[i];
-      int K = K_[i];
-      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-      const MatrixPtr &inMat = getInputValue(i);
-      for (int n = 0; n < batchSize; ++n) {
-        colBuf_->vol2Col(
-            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
-            numFilters_,
-            imgSizeD_[i],
-            imgSizeH_[i],
-            imgSizeW_[i],
-            filterSizeZ_[i],
-            filterSizeY_[i],
-            filterSize_[i],
-            strideZ_[i],
-            strideY_[i],
-            stride_[i],
-            paddingZ_[i],
-            paddingY_[i],
-            padding_[i]);
-        if (weights_[i]->getWGrad()) {
-          real *inData = inMat->getData() + n * inMat->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-            MatrixPtr wGradMatSub =
-                weights_[i]->getWGrad()->subMatrix(g * K, K);
-            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-            wGradMatSub->mul(
-                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
-            inData += M * N;
-          }
-        }
-        if (getInputGrad(i)) {
-          real *preGrad =
-              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
-            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
-            MatrixPtr inGradMatSub =
-                Matrix::create(preGrad, M, N, false, useGpu_);
-            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
-            preGrad += M * N;
-          }
-        }
-      }
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-void DeConv3DLayer::bpropWeights(int i) {}
-void DeConv3DLayer::bpropData(int i) {}
-
-void DeConv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  const MatrixPtr &outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void DeConv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.h b/paddle/legacy/gserver/layers/DeConv3DLayer.h
deleted file mode 100644
index 9931bccb1..000000000
--- a/paddle/legacy/gserver/layers/DeConv3DLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of deconvolution3D layer.
- * This layer expands input and use matrix multiplication to
- * calculate deconvolution3D operation.
- */
-class DeConv3DLayer : public ConvBaseLayer {
- public:
-  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~DeConv3DLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
- protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  IntV NOut_;
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp b/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
deleted file mode 100644
index 93fe046c6..000000000
--- a/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DetectionOutputLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(detection_output, DetectionOutputLayer);
-
-bool DetectionOutputLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto& layerConf = config_.inputs(0).detection_output_conf();
-  numClasses_ = layerConf.num_classes();
-  inputNum_ = layerConf.input_num();
-  nmsThreshold_ = layerConf.nms_threshold();
-  confidenceThreshold_ = layerConf.confidence_threshold();
-  nmsTopK_ = layerConf.nms_top_k();
-  keepTopK_ = layerConf.keep_top_k();
-  backgroundId_ = layerConf.background_id();
-  return true;
-}
-
-void DetectionOutputLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-
-  locSizeSum_ = 0;
-  confSizeSum_ = 0;
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    locSizeSum_ += inLoc->getElementCnt();
-    confSizeSum_ += inConf->getElementCnt();
-  }
-
-  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
-
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto& layerConf = config_.inputs(0).detection_output_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-    locOffset += appendWithPermute(*inLoc,
-                                   height,
-                                   width,
-                                   locSizeSum_,
-                                   locOffset,
-                                   batchSize,
-                                   *locTmpBuffer_,
-                                   kNCHWToNHWC);
-    confOffset += appendWithPermute(*inConf,
-                                    height,
-                                    width,
-                                    confSizeSum_,
-                                    confOffset,
-                                    batchSize,
-                                    *confTmpBuffer_,
-                                    kNCHWToNHWC);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-
-  MatrixPtr priorValue;
-  if (useGpu_) {
-    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
-    Matrix::resizeOrCreate(
-        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
-    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
-    Matrix::resizeOrCreate(
-        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
-
-    locCpuBuffer_->copyFrom(*locTmpBuffer_);
-    confCpuBuffer_->copyFrom(*confTmpBuffer_);
-    priorCpuValue_->copyFrom(*priorTmpValue);
-
-    locBuffer_ = locCpuBuffer_;
-    confBuffer_ = confCpuBuffer_;
-    priorValue = priorCpuValue_;
-  } else {
-    priorValue = getInputValue(*getPriorBoxLayer());
-    locBuffer_ = locTmpBuffer_;
-    confBuffer_ = confTmpBuffer_;
-  }
-  confBuffer_->softmax(*confBuffer_);
-
-  size_t numPriors = priorValue->getElementCnt() / 8;
-  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
-  for (size_t n = 0; n < batchSize; ++n) {
-    std::vector<NormalizedBBox> decodedBBoxes;
-    for (size_t i = 0; i < numPriors; ++i) {
-      size_t priorOffset = i * 8;
-      size_t locPredOffset = n * numPriors * 4 + i * 4;
-      std::vector<NormalizedBBox> priorBBoxVec;
-      getBBoxFromPriorData(
-          priorValue->getData() + priorOffset, 1, priorBBoxVec);
-      std::vector<std::vector<real>> priorBBoxVar;
-      getBBoxVarFromPriorData(
-          priorValue->getData() + priorOffset, 1, priorBBoxVar);
-      std::vector<real> locPredData;
-      for (size_t j = 0; j < 4; ++j)
-        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
-      NormalizedBBox bbox =
-          decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
-      decodedBBoxes.push_back(bbox);
-    }
-    allDecodedBBoxes.push_back(decodedBBoxes);
-  }
-
-  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
-  size_t numKept = getDetectionIndices(confBuffer_->getData(),
-                                       numPriors,
-                                       numClasses_,
-                                       backgroundId_,
-                                       batchSize,
-                                       confidenceThreshold_,
-                                       nmsTopK_,
-                                       nmsThreshold_,
-                                       keepTopK_,
-                                       allDecodedBBoxes,
-                                       &allIndices);
-
-  if (numKept > 0) {
-    resetOutput(numKept, 7);
-  } else {
-    MatrixPtr outV = getOutputValue();
-    if (outV) outV->resize(0, 0);
-    return;
-  }
-  MatrixPtr outV = getOutputValue();
-  getDetectionOutput(confBuffer_->getData(),
-                     numKept,
-                     numPriors,
-                     numClasses_,
-                     batchSize,
-                     allIndices,
-                     allDecodedBBoxes,
-                     *outV);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionOutputLayer.h b/paddle/legacy/gserver/layers/DetectionOutputLayer.h
deleted file mode 100644
index b0270ed33..000000000
--- a/paddle/legacy/gserver/layers/DetectionOutputLayer.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * The detection output layer for a SSD detection task. This layer applies the
- * Non-maximum suppression to the all predicted bounding box and keeps the
- * Top-K bounding boxes.
- * - Input: This layer needs three input layers: The first input layer
- *          is the priorbox layer. The rest two input layers are convolution
- *          layers for generating bbox location offset and the classification
- *          confidence.
- * - Output: The predict bounding box locations.
- */
-
-class DetectionOutputLayer : public Layer {
- public:
-  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr) {}
-
- protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[1 + index];
-  }
-
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[1 + inputNum_ + index];
-  }
-
- private:
-  size_t numClasses_;  // number of classes
-  size_t inputNum_;    // number of input layers
-  real nmsThreshold_;
-  real confidenceThreshold_;
-  size_t nmsTopK_;
-  size_t keepTopK_;
-  size_t backgroundId_;
-
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.cpp b/paddle/legacy/gserver/layers/DetectionUtil.cpp
deleted file mode 100644
index 0dc45e5a7..000000000
--- a/paddle/legacy/gserver/layers/DetectionUtil.cpp
+++ /dev/null
@@ -1,576 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DetectionUtil.h"
-
-namespace paddle {
-
-size_t appendWithPermute(const Matrix& inMatrix,
-                         size_t height,
-                         size_t width,
-                         size_t outTotalSize,
-                         size_t outOffset,
-                         size_t batchSize,
-                         Matrix& outMatrix,
-                         PermMode permMode) {
-  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
-  bool useGpu = inMatrix.useGpu();
-  if (permMode == kNCHWToNHWC) {
-    size_t inElementCnt = inMatrix.getElementCnt();
-    size_t channels = inElementCnt / (height * width * batchSize);
-    size_t imgSize = height * width;
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t offset = i * (outTotalSize / batchSize) + outOffset;
-      const MatrixPtr inTmp = Matrix::create(
-          const_cast<real*>(inMatrix.getData()) + i * channels * imgSize,
-          channels,
-          imgSize,
-          false,
-          useGpu);
-      MatrixPtr outTmp =
-          Matrix::create(const_cast<real*>(outMatrix.getData()) + offset,
-                         imgSize,
-                         channels,
-                         false,
-                         useGpu);
-      inTmp->transpose(outTmp, false);
-    }
-    return channels * imgSize;
-  } else {
-    LOG(FATAL) << "Unkown permute mode";
-  }
-}
-
-size_t decomposeWithPermute(const Matrix& inMatrix,
-                            size_t height,
-                            size_t width,
-                            size_t inTotalSize,
-                            size_t inOffset,
-                            size_t batchSize,
-                            Matrix& outMatrix,
-                            PermMode permMode) {
-  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
-  bool useGpu = inMatrix.useGpu();
-  if (permMode == kNHWCToNCHW) {
-    size_t outElementCnt = outMatrix.getElementCnt();
-    size_t channels = outElementCnt / (height * width * batchSize);
-    size_t imgSize = height * width;
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t offset = i * (inTotalSize / batchSize) + inOffset;
-      const MatrixPtr inTmp =
-          Matrix::create(const_cast<real*>(inMatrix.getData()) + offset,
-                         imgSize,
-                         channels,
-                         false,
-                         useGpu);
-      MatrixPtr outTmp = Matrix::create(
-          const_cast<real*>(outMatrix.getData()) + i * channels * imgSize,
-          channels,
-          imgSize,
-          false,
-          useGpu);
-      inTmp->transpose(outTmp, false);
-    }
-    return channels * imgSize;
-  } else {
-    LOG(FATAL) << "Unkown permute mode";
-  }
-}
-
-real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
-  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
-      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
-    return 0.0;
-  } else {
-    real interXMin = std::max(bbox1.xMin, bbox2.xMin);
-    real interYMin = std::max(bbox1.yMin, bbox2.yMin);
-    real interXMax = std::min(bbox1.xMax, bbox2.xMax);
-    real interYMax = std::min(bbox1.yMax, bbox2.yMax);
-
-    real interWidth = interXMax - interXMin;
-    real interHeight = interYMax - interYMin;
-    real interArea = interWidth * interHeight;
-
-    real bboxArea1 = bbox1.getArea();
-    real bboxArea2 = bbox2.getArea();
-
-    return interArea / (bboxArea1 + bboxArea2 - interArea);
-  }
-}
-
-void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                       const vector<real>& priorBBoxVar,
-                       const NormalizedBBox& gtBBox,
-                       vector<real>& outVec) {
-  real priorBBoxWidth = priorBBox.getWidth();
-  real priorBBoxHeight = priorBBox.getHeight();
-  real priorBBoxCenterX = priorBBox.getCenterX();
-  real priorBBoxCenterY = priorBBox.getCenterY();
-
-  real gtBBoxWidth = gtBBox.getWidth();
-  real gtBBoxHeight = gtBBox.getHeight();
-  real gtBBoxCenterX = gtBBox.getCenterX();
-  real gtBBoxCenterY = gtBBox.getCenterY();
-
-  outVec.clear();
-  outVec.push_back((gtBBoxCenterX - priorBBoxCenterX) / priorBBoxWidth /
-                   priorBBoxVar[0]);
-  outVec.push_back((gtBBoxCenterY - priorBBoxCenterY) / priorBBoxHeight /
-                   priorBBoxVar[1]);
-  outVec.push_back(std::log(std::fabs(gtBBoxWidth / priorBBoxWidth)) /
-                   priorBBoxVar[2]);
-  outVec.push_back(std::log(std::fabs(gtBBoxHeight / priorBBoxHeight)) /
-                   priorBBoxVar[3]);
-}
-
-NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                                 const vector<real>& priorBBoxVar,
-                                 const vector<real>& locPredData) {
-  real priorBBoxWidth = priorBBox.getWidth();
-  real priorBBoxHeight = priorBBox.getHeight();
-  real priorBBoxCenterX = priorBBox.getCenterX();
-  real priorBBoxCenterY = priorBBox.getCenterY();
-
-  real decodedBBoxCenterX =
-      priorBBoxVar[0] * locPredData[0] * priorBBoxWidth + priorBBoxCenterX;
-  real decodedBBoxCenterY =
-      priorBBoxVar[1] * locPredData[1] * priorBBoxHeight + priorBBoxCenterY;
-  real decodedBBoxWidth =
-      std::exp(priorBBoxVar[2] * locPredData[2]) * priorBBoxWidth;
-  real decodedBBoxHeight =
-      std::exp(priorBBoxVar[3] * locPredData[3]) * priorBBoxHeight;
-
-  NormalizedBBox decodedBBox;
-  decodedBBox.xMin = decodedBBoxCenterX - decodedBBoxWidth / 2;
-  decodedBBox.yMin = decodedBBoxCenterY - decodedBBoxHeight / 2;
-  decodedBBox.xMax = decodedBBoxCenterX + decodedBBoxWidth / 2;
-  decodedBBox.yMax = decodedBBoxCenterY + decodedBBoxHeight / 2;
-
-  return decodedBBox;
-}
-
-void getBBoxFromPriorData(const real* priorData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  bboxVec.resize(bboxVec.size() + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    NormalizedBBox bbox;
-    bbox.xMin = *(priorData + i * 8);
-    bbox.yMin = *(priorData + i * 8 + 1);
-    bbox.xMax = *(priorData + i * 8 + 2);
-    bbox.yMax = *(priorData + i * 8 + 3);
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void getBBoxVarFromPriorData(const real* priorData,
-                             const size_t num,
-                             vector<vector<real>>& varVec) {
-  size_t outOffset = varVec.size();
-  varVec.resize(varVec.size() + num);
-  for (size_t i = 0; i < num; ++i) {
-    vector<real> var;
-    var.push_back(*(priorData + i * 8 + 4));
-    var.push_back(*(priorData + i * 8 + 5));
-    var.push_back(*(priorData + i * 8 + 6));
-    var.push_back(*(priorData + i * 8 + 7));
-    varVec[outOffset + i] = var;
-  }
-}
-
-void getBBoxFromLabelData(const real* labelData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  bboxVec.resize(bboxVec.size() + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    NormalizedBBox bbox;
-    bbox.xMin = *(labelData + i * 6 + 1);
-    bbox.yMin = *(labelData + i * 6 + 2);
-    bbox.xMax = *(labelData + i * 6 + 3);
-    bbox.yMax = *(labelData + i * 6 + 4);
-    real isDifficult = *(labelData + i * 6 + 5);
-    if (std::abs(isDifficult - 0.0) < 1e-6)
-      bbox.isDifficult = false;
-    else
-      bbox.isDifficult = true;
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void getBBoxFromDetectData(const real* detectData,
-                           const size_t numBBoxes,
-                           vector<real>& labelVec,
-                           vector<real>& scoreVec,
-                           vector<NormalizedBBox>& bboxVec) {
-  size_t outOffset = bboxVec.size();
-  labelVec.resize(outOffset + numBBoxes);
-  scoreVec.resize(outOffset + numBBoxes);
-  bboxVec.resize(outOffset + numBBoxes);
-  for (size_t i = 0; i < numBBoxes; ++i) {
-    labelVec[outOffset + i] = *(detectData + i * 7 + 1);
-    scoreVec[outOffset + i] = *(detectData + i * 7 + 2);
-    NormalizedBBox bbox;
-    bbox.xMin = *(detectData + i * 7 + 3);
-    bbox.yMin = *(detectData + i * 7 + 4);
-    bbox.xMax = *(detectData + i * 7 + 5);
-    bbox.yMax = *(detectData + i * 7 + 6);
-    bboxVec[outOffset + i] = bbox;
-  }
-}
-
-void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
-               const vector<NormalizedBBox>& gtBBoxes,
-               real overlapThreshold,
-               vector<int>* matchIndices,
-               vector<real>* matchOverlaps) {
-  map<size_t, map<size_t, real>> overlaps;
-  size_t numPriors = priorBBoxes.size();
-  size_t numGTs = gtBBoxes.size();
-
-  matchIndices->clear();
-  matchIndices->resize(numPriors, -1);
-  matchOverlaps->clear();
-  matchOverlaps->resize(numPriors, 0.0);
-
-  // Store the positive overlap between predictions and ground truth
-  for (size_t i = 0; i < numPriors; ++i) {
-    for (size_t j = 0; j < numGTs; ++j) {
-      real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]);
-      if (overlap > 1e-6) {
-        (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap);
-        overlaps[i][j] = overlap;
-      }
-    }
-  }
-  // Bipartite matching
-  vector<int> gtPool;
-  for (size_t i = 0; i < numGTs; ++i) {
-    gtPool.push_back(i);
-  }
-  while (gtPool.size() > 0) {
-    // Find the most overlapped gt and corresponding predictions
-    int maxPriorIdx = -1;
-    int maxGTIdx = -1;
-    real maxOverlap = -1.0;
-    for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
-         it != overlaps.end();
-         ++it) {
-      size_t i = it->first;
-      if ((*matchIndices)[i] != -1) {
-        // The prediction already has matched ground truth or is ignored
-        continue;
-      }
-      for (size_t p = 0; p < gtPool.size(); ++p) {
-        int j = gtPool[p];
-        if (it->second.find(j) == it->second.end()) {
-          // No overlap between the i-th prediction and j-th ground truth
-          continue;
-        }
-        // Find the maximum overlapped pair
-        if (it->second[j] > maxOverlap) {
-          maxPriorIdx = (int)i;
-          maxGTIdx = (int)j;
-          maxOverlap = it->second[j];
-        }
-      }
-    }
-    if (maxPriorIdx == -1) {
-      break;
-    } else {
-      (*matchIndices)[maxPriorIdx] = maxGTIdx;
-      (*matchOverlaps)[maxPriorIdx] = maxOverlap;
-      gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx));
-    }
-  }
-
-  // Get most overlaped for the rest prediction bboxes
-  for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
-       it != overlaps.end();
-       ++it) {
-    size_t i = it->first;
-    if ((*matchIndices)[i] != -1) {
-      // The prediction already has matched ground truth or is ignored
-      continue;
-    }
-    int maxGTIdx = -1;
-    real maxOverlap = -1;
-    for (size_t j = 0; j < numGTs; ++j) {
-      if (it->second.find(j) == it->second.end()) {
-        // No overlap between the i-th prediction and j-th ground truth
-        continue;
-      }
-      // Find the maximum overlapped pair
-      real overlap = it->second[j];
-      if (overlap > maxOverlap && overlap >= overlapThreshold) {
-        maxGTIdx = j;
-        maxOverlap = overlap;
-      }
-    }
-    if (maxGTIdx != -1) {
-      (*matchIndices)[i] = maxGTIdx;
-      (*matchOverlaps)[i] = maxOverlap;
-    }
-  }
-}
-
-pair<size_t, size_t> generateMatchIndices(
-    const Matrix& priorValue,
-    const size_t numPriorBBoxes,
-    const Matrix& gtValue,
-    const int* gtStartPosPtr,
-    const size_t seqNum,
-    const vector<vector<real>>& maxConfScore,
-    const size_t batchSize,
-    const real overlapThreshold,
-    const real negOverlapThreshold,
-    const size_t negPosRatio,
-    vector<vector<int>>* matchIndicesVecPtr,
-    vector<vector<int>>* negIndicesVecPtr) {
-  vector<NormalizedBBox> priorBBoxes;  // share same prior bboxes
-  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, priorBBoxes);
-  size_t totalPos = 0;
-  size_t totalNeg = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    vector<int> matchIndices;
-    vector<int> negIndices;
-    vector<real> matchOverlaps;
-    matchIndices.resize(numPriorBBoxes, -1);
-    matchOverlaps.resize(numPriorBBoxes, 0.0);
-    size_t numGTBBoxes = 0;
-    if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n];
-    if (!numGTBBoxes) {
-      matchIndicesVecPtr->push_back(matchIndices);
-      negIndicesVecPtr->push_back(negIndices);
-      continue;
-    }
-    vector<NormalizedBBox> gtBBoxes;
-    getBBoxFromLabelData(
-        gtValue.getData() + gtStartPosPtr[n] * 6, numGTBBoxes, gtBBoxes);
-
-    matchBBox(
-        priorBBoxes, gtBBoxes, overlapThreshold, &matchIndices, &matchOverlaps);
-
-    size_t numPos = 0;
-    size_t numNeg = 0;
-    for (size_t i = 0; i < matchIndices.size(); ++i)
-      if (matchIndices[i] != -1) ++numPos;
-    totalPos += numPos;
-    vector<pair<real, size_t>> scoresIndices;
-    for (size_t i = 0; i < matchIndices.size(); ++i)
-      if (matchIndices[i] == -1 && matchOverlaps[i] < negOverlapThreshold) {
-        scoresIndices.push_back(std::make_pair(maxConfScore[n][i], i));
-        ++numNeg;
-      }
-    numNeg = std::min(static_cast<size_t>(numPos * negPosRatio), numNeg);
-    std::sort(scoresIndices.begin(),
-              scoresIndices.end(),
-              sortScorePairDescend<size_t>);
-    for (size_t i = 0; i < numNeg; ++i)
-      negIndices.push_back(scoresIndices[i].second);
-    totalNeg += numNeg;
-    matchIndicesVecPtr->push_back(matchIndices);
-    negIndicesVecPtr->push_back(negIndices);
-  }
-  return std::make_pair(totalPos, totalNeg);
-}
-
-void getMaxConfidenceScores(const real* confData,
-                            const size_t batchSize,
-                            const size_t numPriorBBoxes,
-                            const size_t numClasses,
-                            const size_t backgroundId,
-                            vector<vector<real>>* maxConfScoreVecPtr) {
-  maxConfScoreVecPtr->clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    vector<real> maxConfScore;
-    for (size_t j = 0; j < numPriorBBoxes; ++j) {
-      int offset = j * numClasses;
-      real maxVal = -FLT_MAX;
-      real maxPosVal = -FLT_MAX;
-      real maxScore = 0.0;
-      for (size_t c = 0; c < numClasses; ++c) {
-        maxVal = std::max<real>(confData[offset + c], maxVal);
-        if (c != backgroundId)
-          maxPosVal = std::max<real>(confData[offset + c], maxPosVal);
-      }
-      real sum = 0.0;
-      for (size_t c = 0; c < numClasses; ++c)
-        sum += std::exp(confData[offset + c] - maxVal);
-      maxScore = std::exp(maxPosVal - maxVal) / sum;
-      maxConfScore.push_back(maxScore);
-    }
-    confData += numPriorBBoxes * numClasses;
-    maxConfScoreVecPtr->push_back(maxConfScore);
-  }
-}
-
-template <typename T>
-bool sortScorePairDescend(const pair<real, T>& pair1,
-                          const pair<real, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <>
-bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
-                          const pair<real, NormalizedBBox>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-void applyNMSFast(const vector<NormalizedBBox>& bboxes,
-                  const real* confScoreData,
-                  size_t classIdx,
-                  size_t topK,
-                  real confThreshold,
-                  real nmsThreshold,
-                  size_t numPriorBBoxes,
-                  size_t numClasses,
-                  vector<size_t>* indices) {
-  vector<pair<real, size_t>> scores;
-  for (size_t i = 0; i < numPriorBBoxes; ++i) {
-    size_t confOffset = i * numClasses + classIdx;
-    if (confScoreData[confOffset] > confThreshold)
-      scores.push_back(std::make_pair(confScoreData[confOffset], i));
-  }
-  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
-  if (topK > 0 && topK < scores.size()) scores.resize(topK);
-  while (scores.size() > 0) {
-    const size_t idx = scores.front().second;
-    bool keep = true;
-    for (size_t i = 0; i < indices->size(); ++i) {
-      if (keep) {
-        const size_t savedIdx = (*indices)[i];
-        real overlap = jaccardOverlap(bboxes[idx], bboxes[savedIdx]);
-        keep = overlap <= nmsThreshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) indices->push_back(idx);
-    scores.erase(scores.begin());
-  }
-}
-
-size_t getDetectionIndices(
-    const real* confData,
-    const size_t numPriorBBoxes,
-    const size_t numClasses,
-    const size_t backgroundId,
-    const size_t batchSize,
-    const real confThreshold,
-    const size_t nmsTopK,
-    const real nmsThreshold,
-    const size_t keepTopK,
-    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
-  size_t totalKeepNum = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
-    size_t numDetected = 0;
-    map<size_t, vector<size_t>> indices;
-    size_t confOffset = n * numPriorBBoxes * numClasses;
-    for (size_t c = 0; c < numClasses; ++c) {
-      if (c == backgroundId) continue;
-      applyNMSFast(decodedBBoxes,
-                   confData + confOffset,
-                   c,
-                   nmsTopK,
-                   confThreshold,
-                   nmsThreshold,
-                   numPriorBBoxes,
-                   numClasses,
-                   &(indices[c]));
-      numDetected += indices[c].size();
-    }
-    if (keepTopK > 0 && numDetected > keepTopK) {
-      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
-      for (size_t c = 0; c < numClasses; ++c) {
-        const vector<size_t>& labelIndices = indices[c];
-        for (size_t i = 0; i < labelIndices.size(); ++i) {
-          size_t idx = labelIndices[i];
-          scoreIndexPairs.push_back(
-              std::make_pair((confData + confOffset)[idx * numClasses + c],
-                             std::make_pair(c, idx)));
-        }
-      }
-      std::sort(scoreIndexPairs.begin(),
-                scoreIndexPairs.end(),
-                sortScorePairDescend<pair<size_t, size_t>>);
-      scoreIndexPairs.resize(keepTopK);
-      map<size_t, vector<size_t>> newIndices;
-      for (size_t i = 0; i < scoreIndexPairs.size(); ++i) {
-        size_t label = scoreIndexPairs[i].second.first;
-        size_t idx = scoreIndexPairs[i].second.second;
-        newIndices[label].push_back(idx);
-      }
-      allDetectionIndices->push_back(newIndices);
-      totalKeepNum += keepTopK;
-    } else {
-      allDetectionIndices->push_back(indices);
-      totalKeepNum += numDetected;
-    }
-  }
-  return totalKeepNum;
-}
-
-void getDetectionOutput(const real* confData,
-                        const size_t numKept,
-                        const size_t numPriorBBoxes,
-                        const size_t numClasses,
-                        const size_t batchSize,
-                        const vector<map<size_t, vector<size_t>>>& allIndices,
-                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-                        Matrix& out) {
-  MatrixPtr outBuffer;
-  Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
-  real* bufferData = outBuffer->getData();
-  size_t count = 0;
-  for (size_t n = 0; n < batchSize; ++n) {
-    for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
-         it != allIndices[n].end();
-         ++it) {
-      size_t label = it->first;
-      const vector<size_t>& indices = it->second;
-      const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
-      for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        size_t confOffset = n * numPriorBBoxes * numClasses + idx * numClasses;
-        bufferData[count * 7] = n;
-        bufferData[count * 7 + 1] = label;
-        bufferData[count * 7 + 2] = (confData + confOffset)[label];
-        NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]);
-        bufferData[count * 7 + 3] = clippedBBox.xMin;
-        bufferData[count * 7 + 4] = clippedBBox.yMin;
-        bufferData[count * 7 + 5] = clippedBBox.xMax;
-        bufferData[count * 7 + 6] = clippedBBox.yMax;
-        ++count;
-      }
-    }
-  }
-  out.copyFrom(bufferData, numKept * 7);
-}
-
-NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
-  real realOne = static_cast<real>(1.0);
-  real realZero = static_cast<real>(0.0);
-  NormalizedBBox clippedBBox;
-  clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero);
-  clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero);
-  clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero);
-  clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero);
-  return clippedBBox;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.h b/paddle/legacy/gserver/layers/DetectionUtil.h
deleted file mode 100644
index c1e0bb809..000000000
--- a/paddle/legacy/gserver/layers/DetectionUtil.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <float.h>
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/Matrix.h"
-
-using std::vector;
-using std::pair;
-using std::map;
-
-namespace paddle {
-
-template <typename T>
-struct BBoxBase {
-  BBoxBase(T xMin, T yMin, T xMax, T yMax)
-      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
-
-  BBoxBase() {}
-
-  T getWidth() const { return xMax - xMin; }
-
-  T getHeight() const { return yMax - yMin; }
-
-  T getCenterX() const { return (xMin + xMax) / 2; }
-
-  T getCenterY() const { return (yMin + yMax) / 2; }
-
-  T getArea() const { return getWidth() * getHeight(); }
-
-  // coordinate of bounding box
-  T xMin;
-  T yMin;
-  T xMax;
-  T yMax;
-  // whether difficult object (e.g. object with heavy occlusion is difficult)
-  bool isDifficult;
-};
-
-struct NormalizedBBox : BBoxBase<real> {
-  NormalizedBBox() : BBoxBase<real>() {}
-};
-
-enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
-
-/**
- * @brief First permute input maxtrix then append to output matrix
- */
-size_t appendWithPermute(const Matrix& inMatrix,
-                         size_t height,
-                         size_t width,
-                         size_t outTotalSize,
-                         size_t outOffset,
-                         size_t batchSize,
-                         Matrix& outMatrix,
-                         PermMode permMode);
-
-/**
- * @brief First permute input maxtrix then decompose to output
- */
-size_t decomposeWithPermute(const Matrix& inMatrix,
-                            size_t height,
-                            size_t width,
-                            size_t totalSize,
-                            size_t offset,
-                            size_t batchSize,
-                            Matrix& outMatrix,
-                            PermMode permMode);
-
-/**
- * @brief Compute jaccard overlap between two bboxes.
- * @param bbox1 The first bbox
- * @param bbox2 The second bbox
- */
-real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
-
-/**
- * @brief Compute offset parameters between prior bbox and ground truth bbox
- * and variances of prior bbox are considered
- * @param priorBBox Input prior bbox
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param gtBBox Groundtruth bbox
- * @param outVec Output vector
- */
-void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                       const vector<real>& priorBBoxVar,
-                       const NormalizedBBox& gtBBox,
-                       vector<real>& outVec);
-
-/**
- * @brief Decode prior bbox with offset parameters
- * and variances of prior bbox are considered
- * @param priorBBox Prior bbox to be decoded
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param locPredData Offset parameters
- */
-NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                                 const vector<real>& priorBBoxVar,
-                                 const vector<real>& locPredData);
-
-/**
- * @brief Extract bboxes from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param numBBoxes Number of bbox to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromPriorData(const real* priorData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract labels, scores and bboxes from detection matrix, the layout is
- * imageId | label | score | xmin | ymin | xmax | ymax
- * @param detectData Matrix of detection value
- * @param numBBoxes Number of bbox to be extracted
- * @param labelVec Label of bbox
- * @param scoreVec Score of bbox
- * @param bboxVec Append to the vector
- */
-void getBBoxFromDetectData(const real* detectData,
-                           const size_t numBBoxes,
-                           vector<real>& labelVec,
-                           vector<real>& scoreVec,
-                           vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract variances from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param num Number to be extracted
- * @param varVec Append to the vector
- */
-void getBBoxVarFromPriorData(const real* priorData,
-                             const size_t num,
-                             vector<vector<real>>& varVec);
-
-/**
- * @brief Extract bboxes from label matrix, the layout is
- * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
- * @param labelData Matrix of label value
- * @param numBBoxes Number to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromLabelData(const real* labelData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
-* @brief Match prior bbox to groundtruth bbox, the strategy is:
-1. Find the most overlaped bbox pair (prior and groundtruth)
-2. For rest of prior bboxes find the most overlaped groundtruth bbox
-* @param priorBBoxes prior bbox
-* @param gtBBoxes groundtruth bbox
-* @param overlapThreshold Low boundary of overlap (judge whether matched)
-* @param matchIndices For each prior bbox, groundtruth bbox index if matched
-otherwise -1
-* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes
-*/
-void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
-               const vector<NormalizedBBox>& gtBBoxes,
-               real overlapThreshold,
-               vector<int>* matchIndices,
-               vector<real>* matchOverlaps);
-
-/**
-* @brief Generate positive bboxes and negative bboxes,
-|positive bboxes|/|negative bboxes| is negPosRatio
-* @param priorValue Prior value
-* @param numPriorBBoxes Number of prior bbox
-* @param gtValue Groundtruth value
-* @param gtStartPosPtr Since groundtruth value stored as sequence type,
-this parameter indicates start position of each record
-* @param seqNum Number of sequence
-* @param maxConfScore Classification score for prior bbox, used to mine
-negative examples
-* @param batchSize Image number
-* @param overlapThreshold Low boundary of overap
-* @param negOverlapThreshold Upper boundary of overap (judge negative example)
-* @param negPosRatio Control number of negative bboxes
-* @param matchIndicesVecPtr Save indices of matched prior bbox
-* @param negIndicesVecPtr Save indices of negative prior bbox
-*/
-pair<size_t, size_t> generateMatchIndices(
-    const Matrix& priorValue,
-    const size_t numPriorBBoxes,
-    const Matrix& gtValue,
-    const int* gtStartPosPtr,
-    const size_t seqNum,
-    const vector<vector<real>>& maxConfScore,
-    const size_t batchSize,
-    const real overlapThreshold,
-    const real negOverlapThreshold,
-    const size_t negPosRatio,
-    vector<vector<int>>* matchIndicesVecPtr,
-    vector<vector<int>>* negIndicesVecPtr);
-
-/**
- * @brief Get max confidence score for each prior bbox
- * @param confData Confidence scores, layout is
- * class1 score | class2 score | ... | classN score ...
- * @param batchSize Image number
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Classes number
- * @param backgroundId Background id
- * @param maxConfScoreVecPtr Ouput
- */
-void getMaxConfidenceScores(const real* confData,
-                            const size_t batchSize,
-                            const size_t numPriorBBoxes,
-                            const size_t numClasses,
-                            const size_t backgroundId,
-                            vector<vector<real>>* maxConfScoreVecPtr);
-
-template <typename T>
-bool sortScorePairDescend(const pair<real, T>& pair1,
-                          const pair<real, T>& pair2);
-
-template <>
-bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
-                          const pair<real, NormalizedBBox>& pair2);
-
-/**
- * @brief Do NMS for bboxes to remove duplicated bboxes
- * @param bboxes BBoxes to apply NMS
- * @param confScoreData Confidence scores
- * @param classIdx Class to do NMS
- * @param topK Number to keep
- * @param confThreshold Low boundary of confidence score
- * @param nmsThreshold Threshold of overlap
- * @param numPriorBBoxes Total number of prior bboxes
- * @param numClasses Total class number
- * @param indices Indices of high quality bboxes
- */
-void applyNMSFast(const vector<NormalizedBBox>& bboxes,
-                  const real* confScoreData,
-                  size_t classIdx,
-                  size_t topK,
-                  real confThreshold,
-                  real nmsThreshold,
-                  size_t numPriorBBoxes,
-                  size_t numClasses,
-                  vector<size_t>* indices);
-
-/**
- * @brief Get detection results which satify requirements
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param backgroundId Background class
- * @param batchSize Image number
- * @param confThreshold Threshold of class confidence
- * @param nmsTopK Used in NMS operation to keep top k bbox
- * @param nmsThreshold Used in NMS, threshold of overlap
- * @param keepTopK How many bboxes keeped in an image
- * @param allDecodedBBoxes Decoded bboxes for all images
- * @param allDetectionIndices Save detection bbox indices
- */
-size_t getDetectionIndices(
-    const real* confData,
-    const size_t numPriorBBoxes,
-    const size_t numClasses,
-    const size_t backgroundId,
-    const size_t batchSize,
-    const real confThreshold,
-    const size_t nmsTopK,
-    const real nmsThreshold,
-    const size_t keepTopK,
-    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
-
-/**
- * @brief Get detection results
- * @param confData Confidence scores
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param batchSize Image number
- * @param allIndices Indices of predicted bboxes
- * @param allDecodedBBoxes BBoxes decoded
- * @param out Output matrix
- * image number | label | confidence score | xMin | yMin | xMax | yMax
- */
-void getDetectionOutput(const real* confData,
-                        const size_t numKept,
-                        const size_t numPriorBBoxes,
-                        const size_t numClasses,
-                        const size_t batchSize,
-                        const vector<map<size_t, vector<size_t>>>& allIndices,
-                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-                        Matrix& out);
-
-NormalizedBBox clipBBox(const NormalizedBBox& bbox);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulOperator.cpp b/paddle/legacy/gserver/layers/DotMulOperator.cpp
deleted file mode 100644
index 03d18d9b2..000000000
--- a/paddle/legacy/gserver/layers/DotMulOperator.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Operator.h"
-
-namespace paddle {
-
-/**
- * DotMulOperator takes two inputs, performs element-wise multiplication:
- * \f[
- *   out.row[i] += scale * (in1.row[i] .* in2.row[i])
- * \f]
- * where \f$.*\f$ means element-wise multiplication,
- * and scale is a config scalar, its default value is one.
- *
- * The config file api is dotmul_operator.
- */
-class DotMulOperator : public Operator {
- public:
-  DotMulOperator(const OperatorConfig& config, bool useGpu);
-  virtual void forward();
-  virtual void backward();
-};
-
-REGISTER_OPERATOR(dot_mul, DotMulOperator);
-
-DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK_EQ(config_.input_indices_size(), 2L);
-}
-
-void DotMulOperator::forward() {
-  out_->value->addDotMul(
-      *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale());
-}
-
-void DotMulOperator::backward() {
-  const MatrixPtr& inV0 = ins_[0]->value;
-  const MatrixPtr& inV1 = ins_[1]->value;
-  const MatrixPtr& inG0 = ins_[0]->grad;
-  const MatrixPtr& inG1 = ins_[1]->grad;
-
-  if (inG0) {
-    inG0->addDotMul(*out_->grad, *inV1, 1, config_.dotmul_scale());
-  }
-  if (inG1) {
-    inG1->addDotMul(*out_->grad, *inV0, 1, config_.dotmul_scale());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulProjection.cpp b/paddle/legacy/gserver/layers/DotMulProjection.cpp
deleted file mode 100644
index d77803876..000000000
--- a/paddle/legacy/gserver/layers/DotMulProjection.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * DotMulProjection performs element-wise multiplication with weight:
- * \f[
- *   out.row[i] += in.row[i] .* weight
- * \f]
- * where \f$.*\f$ means element-wise multiplication.
- *
- * The config file api is dotmul_projection.
- */
-class DotMulProjection : public Projection {
- public:
-  DotMulProjection(const ProjectionConfig& config,
-                   const ParameterPtr& parameter,
-                   bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  /// shared memory with parameter
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(dot_mul, DotMulProjection);
-
-DotMulProjection::DotMulProjection(const ProjectionConfig& config,
-                                   const ParameterPtr& parameter,
-                                   bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(new Weight(1LU, config.output_size(), parameter));
-}
-
-void DotMulProjection::forward() {
-  out_->value->addDotMulMMV(*in_->value, *(weight_->getW()));
-}
-
-void DotMulProjection::backward(const UpdateCallback& callback) {
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->addDotMulVMM(*out_->grad, *in_->value);
-  }
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    in_->grad->addDotMulMMV(*out_->grad, *(weight_->getW()));
-  }
-
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotProdLayer.cpp b/paddle/legacy/gserver/layers/DotProdLayer.cpp
deleted file mode 100644
index 06060d93f..000000000
--- a/paddle/legacy/gserver/layers/DotProdLayer.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the dot product of two vectors.
- * Input1: vector (batchSize * dim)
- * Input2: vector (batchSize * dim)
- * Output: a matrix: (batchSize * 1)
- */
-
-class DotProdLayer : public Layer {
- public:
-  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DotProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(dot_prod, DotProdLayer);
-
-bool DotProdLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-  CHECK_EQ(1UL, getSize())
-      << "The output dimensionality of this layer should be fixed to 1.";
-
-  return true;
-}
-
-void DotProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  CHECK_EQ(inV1->getHeight(), batchSize);
-  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, 1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
-    outV->sumOfProducts(*inV0, *inV1, 1, 0);
-  }
-}
-
-void DotProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
-
-    if (inG0) {
-      inG0->addRowScale(0, *inV1, *outG);
-    }
-
-    if (inG1) {
-      inG1->addRowScale(0, *inV0, *outG);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
deleted file mode 100644
index 38671126c..000000000
--- a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-/**
- * A layer for checking EOS for each sample:
- * - output_id = (input_id == conf.eos_id)
- *
- * The result is stored in output_.ids.
- * It is used by recurrent layer group.
- */
-class EosIdCheckLayer : public Layer {
- public:
-  explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    const Argument& input = getInput(0);
-    IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_);
-    output_.ids->isEqualTo(*input.ids, config_.eos_id());
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(eos_id, EosIdCheckLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
deleted file mode 100644
index 8a53db380..000000000
--- a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DEFINE_bool(use_nnpack,
-            false,
-            "Whether to use nnpack for convolution calculation.");
-
-namespace paddle {
-
-/*
- * The calculation of the exconvt(convolution transpose (deconv) operation)
- * is a swap of forward and backward of the calculation of exconv.
- * */
-REGISTER_LAYER(exconv, ExpandConvLayer);
-REGISTER_LAYER(exconvt, ExpandConvLayer);
-
-inline bool isDepthwiseConv(int channels, int groups) {
-  return channels == groups;
-}
-
-bool ExpandConvLayer::init(const LayerMap &layerMap,
-                           const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
-
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * filterChannels_[index];
-    width = (!isDeconv_) ? numFilters_ : channels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    index++;
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ = std::unique_ptr<Weight>(
-          new Weight(1, numFilters_, biasParameter_, 0));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_, 0));
-    }
-  }
-
-  getOutputSize();
-
-  size_t numInputs = config_.inputs_size();
-  inputShape_.resize(numInputs);
-  filterShape_.resize(numInputs);
-  outputShape_.resize(numInputs);
-
-  std::string convType;
-  std::string convGradInputType;
-  std::string convGradFilterType;
-
-  for (int i = 0; i < config_.inputs_size(); i++) {
-    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
-    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    std::vector<size_t> dilations = {(size_t)dilationY_[i],
-                                     (size_t)dilation_[i]};
-
-    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
-
-    // Convolution Layer uses the GemmConv function by default.
-    convType = "GemmConv";
-    convGradInputType = "GemmConvGradInput";
-    convGradFilterType = "GemmConvGradFilter";
-
-    // If depth wise convolution and useGpu == true
-    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-      convType = "DepthwiseConv";
-      convGradInputType = "DepthwiseConvGradInput";
-      convGradFilterType = "DepthwiseConvGradFilter";
-    }
-
-    // If depth wise convolution and useGpu == false and ARM-NEON
-    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-      if ((filterSize_[i] == filterSizeY_[i]) &&
-          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
-          !useDilation) {
-        convType = "NeonDepthwiseConv";
-      }
-#endif
-    }
-
-    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
-      createFunction(forward_,
-                     "NNPACKConv",
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("groups", (size_t)groups_[i])
-                         .set("algo", std::string("auto")));
-    } else {
-      createFunction(forward_,
-                     !isDeconv_ ? convType : convGradInputType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     !isDeconv_ ? convGradInputType : convType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     convGradFilterType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-    }
-  }
-  return true;
-}
-
-size_t ExpandConvLayer::getOutputSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  size_t layerSize = ConvBaseLayer::calOutputSize();
-  return layerSize;
-}
-
-// i is the index of input layers
-#define BACKWARD_INPUT(i, inputs, outputs) \
-  backward_[2 * i]->calc(inputs, outputs)
-#define BACKWARD_FILTER(i, inputs, outputs) \
-  backward_[2 * i + 1]->calc(inputs, outputs)
-
-void ExpandConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-
-  // Calculate the shape of the input, output, and filter.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    inputShape_[i] = TensorShape({(size_t)batchSize,
-                                  (size_t)channels_[i],
-                                  (size_t)imgSizeH_[i],
-                                  (size_t)imgSizeW_[i]});
-    filterShape_[i] =
-        TensorShape({(size_t)groups_[i],
-                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
-                                : (size_t)channels_[i] / groups_[i],
-                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
-                                : (size_t)numFilters_ / groups_[i],
-                     (size_t)filterSizeY_[i],
-                     (size_t)filterSize_[i]});
-    outputShape_[i] = TensorShape({(size_t)batchSize,
-                                   (size_t)numFilters_,
-                                   (size_t)outputH_[i],
-                                   (size_t)outputW_[i]});
-  }
-
-  // Calculate the output value.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getInputValue(i), inputShape_[i]);
-    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-    outputs.addArg(*getOutputValue(),
-                   outputShape_[i],
-                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
-
-    forward_[i]->calc(inputs, outputs);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get()) {
-    output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void ExpandConvLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  MatrixPtr outGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // Calculate the input grad and filter grad.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (getInputGrad(i)) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
-      BACKWARD_INPUT(i, inputs, outputs);
-    }
-
-    if (weights_[i]->getWGrad()) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      if (!isDeconv_) {
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-      } else {
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      }
-      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
-      BACKWARD_FILTER(i, inputs, outputs);
-
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.h b/paddle/legacy/gserver/layers/ExpandConvLayer.h
deleted file mode 100644
index c0eff3ab0..000000000
--- a/paddle/legacy/gserver/layers/ExpandConvLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- *
- * The config file api is img_conv_layer.
- */
-
-class ExpandConvLayer : public ConvBaseLayer {
- public:
-  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-
-  ~ExpandConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  size_t getOutputSize();
-
- protected:
-  std::vector<TensorShape> inputShape_;
-  std::vector<TensorShape> filterShape_;
-  std::vector<TensorShape> outputShape_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.cpp b/paddle/legacy/gserver/layers/ExpandLayer.cpp
deleted file mode 100644
index 074fbab8e..000000000
--- a/paddle/legacy/gserver/layers/ExpandLayer.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(expand, ExpandLayer);
-
-bool ExpandLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 2UL);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // which sequence type of input[0]
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  // Expand layer should have exactly 2 input, one for data, one for size
-  CHECK_EQ(2U, inputLayers_.size());
-
-  // using two input:
-  // * first one for data;
-  // * second one only for sequence info
-  const Argument& shapeInput = getInput(1);
-  const Argument& dataInput = getInput(0);
-  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-                              : shapeInput.sequenceStartPositions;
-  size_t numSequences = startPositions->getSize() - 1;
-  const int* starts = startPositions->getData(false);
-
-  CHECK_EQ(starts[numSequences], shapeInput.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input[1] must hasSubseq
-    CHECK_EQ(shapeInput.hasSubseq(), 1UL);
-    CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences());
-  } else {
-    CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences());
-  }
-
-  // set output sequence info as shape sequence
-  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
-  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-  }
-
-  // reserve output: Expand output to batchsize of sequence data.
-  reserveOutput(outputBatchSize, dataInput.value->getWidth());
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = expandStartsPos_->getMutableData(false);
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    for (int j = 0; j < sequenceLength; j++) {
-      expandStarts[starts[sequenceId] + j] = sequenceId;
-    }
-  }
-
-  outputValue->copyByRowIndex(*inputValue,
-                              *expandStartsPos_->getVector(useGpu_));
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-}
-
-void ExpandLayer::backward(const UpdateCallback& callback) {
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  if (!getInputGrad(0)) return;
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-                              : getInput(1).sequenceStartPositions;
-  size_t numSequences = cpuSeqStartPos->getSize() - 1;
-  const int* starts = cpuSeqStartPos->getData(false);
-
-  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
-  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);
-
-  AsyncGpuBlock asyncGpuBlock;
-
-  // sum to get the grad
-  real scale = 1;
-  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
-    // TODO(Dangqingqing) optimization for GPU
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    if (sequenceLength == 0) {
-      // empty sequence
-      continue;
-    }
-    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
-    copyData->collectBias(
-        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.h b/paddle/legacy/gserver/layers/ExpandLayer.h
deleted file mode 100644
index 75a1ec756..000000000
--- a/paddle/legacy/gserver/layers/ExpandLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "Expand Dense data or (sequence data where the length of each
- * sequence is one) to sequence data."
- *
- * It should have exactly 2 input, one for data, one for size:
- * - first one for data
- *   - If ExpandLevel = kNonSeq: dense data
- *   - If ExpandLevel = kSeq: sequence data where the length of each sequence is
- * one
- * - second one only for sequence info
- *   - should be sequence data with or without sub-sequence.
- *
- * And the output size is the batch size(not instances) of second input.
- *
- * The config file api is expand_layer.
- */
-
-class ExpandLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  /// if input[0] is dense data, ExpandLevel=kNonSeq;
-  /// if input[0] is sequence data, ExpandLevel=kSeq
-  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
-  /// store the ExpandLevel
-  int type_;
-  /// expanded sequenceStartPositions or subSequenceStartPositions
-  /// of input[1]
-  ICpuGpuVectorPtr expandStartsPos_;
-
- public:
-  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
deleted file mode 100644
index 6cf269fa3..000000000
--- a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FactorizationMachineLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
-
-bool FactorizationMachineLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  factorSize_ = config_.factor_size();
-
-  /* initialize the latentVectors_ */
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t inputSize = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
-  latentVectors_ = std::unique_ptr<Weight>(
-      new Weight(inputSize, factorSize_, parameters_[0]));
-
-  return true;
-}
-
-void FactorizationMachineLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const MatrixPtr& inputV = getInputValue(0);
-
-  size_t batchSize = inputV->getHeight();
-  size_t outputSize = getSize();
-  size_t inputSize = inputLayers_[0]->getSize();
-  reserveOutput(batchSize, outputSize);
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(
-      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
-
-  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
-  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
-  inputMulFactor_->square2(*tmpOut_);
-  outV->sumRows(*tmpOut_, 0.5, 0);
-
-  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
-                                       inputV->getHeight(),
-                                       inputV->getWidth(),
-                                       inputV->getElementCnt(),
-                                       inputV->getValueType());
-    inputSquare_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
-  } else {
-    Matrix::resizeOrCreate(
-        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-    inputV->square2(*inputSquare_);
-  }
-  latentVectors_->getW()->square2(*latentVectorsSquare_);
-  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
-  outV->sumRows(*tmpOut_, -0.5, 1.0);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  const MatrixPtr& inputV = getInputValue(0);
-  const MatrixPtr& oGrad = getOutputGrad();
-
-  Matrix::resizeOrCreate(
-      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
-                                         latentVectors_->getW()->getHeight(),
-                                         1,
-                                         false,
-                                         useGpu_);
-
-  /* Calculate the gradients of the latentVectors_ matrix */
-  if (latentVectors_->getWGrad()) {
-    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
-                                         inputV->getHeight(),
-                                         inputV->getWidth(),
-                                         inputV->getElementCnt());
-
-      CpuSparseMatrix* sparseInputV =
-          dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* sparseInputSquare =
-          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
-      CpuSparseMatrix* sparseTmpInput =
-          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
-      sparseTmpInput->copyFrom(*sparseInputV);
-
-      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
-      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
-
-      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
-      negOnes_->zeroMem();
-      negOnes_->add(-1);
-      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
-    } else {
-      Matrix::resizeOrCreate(
-          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-
-      tmpInput_->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
-      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
-
-      tmpSum_->sumCols(*tmpInput_, -1, 0);
-    }
-
-    latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSumTrans);
-
-    /* Increasing the number of gradient */
-    latentVectors_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers gradient */
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad != NULL) {
-    inGrad->mul(
-        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
-    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum_);
-    inGrad->rowScale(0, *inGrad, *oGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
deleted file mode 100644
index fc015ed72..000000000
--- a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief The Factorization Machine models pairwise (order-2) feature
- * interactions as inner product of the learned latent vectors corresponding
- * to each input feature.
- *
- * The Factorization Machine can effectively capture feature interactions
- * especially when the input is sparse. While in principle FM can model higher
- * order feature interaction, in practice usually only order-2 feature
- * interactions are considered. The Factorization Machine Layer here only
- * computes the order-2 interations with the formula:
- *
- * \f[
- *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
- * \f]
- *
- * The detailed calculation for forward and backward can be found at this paper:
- *
- *     Factorization machines.
- *
- * The config file api is factorization_machine.
- */
-
-class FactorizationMachineLayer : public Layer {
- protected:
-  // The latent vectors, shape: (size, factorSize_)
-  // Each row of the latentVectors_ matrix is the latent vector
-  // corresponding to one input feature dimension
-  std::unique_ptr<Weight> latentVectors_;
-  // The hyperparameter that defines the dimensionality of the factorization
-  size_t factorSize_;
-
- private:
-  // Store the square values of the letent vectors matrix
-  MatrixPtr latentVectorsSquare_;
-  // Store the square values of input matrix
-  MatrixPtr inputSquare_;
-  // The result of input matrix * latent vector matrix that will be used in
-  // both forward and backward step
-  MatrixPtr inputMulFactor_;
-  // Store temporary calculation result
-  MatrixPtr tmpOut_;
-  MatrixPtr tmpSum_;
-  MatrixPtr tmpInput_;
-  // Negative identity matrix
-  MatrixPtr negOnes_;
-
- public:
-  explicit FactorizationMachineLayer(const LayerConfig& config)
-      : Layer(config) {}
-  ~FactorizationMachineLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
deleted file mode 100644
index a3fe1433e..000000000
--- a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for expanding a batch of images to feature maps.
- * Each data of the input is a 2 dimensional matrix. Each element of the matrix
- * is replicated num_filters times to create a feature map with num_filters
- * channels.
- * - Input: Input one should be dense image data.
- * - Output: expanded fature maps.
- * \f[
- *  y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1)
- * \f]
- * For example, num_filters = 4:
- * @code
- *   x = [a1,a2;
- *        b1,b2]
- *   y = [a1, a2, a1, a2, a1, a2, a1, a2;
- *        b1, b2, b1, b2, b1, b2, b1, b2;]
- * @endcode
- */
-
-class FeatureMapExpandLayer : public Layer {
- private:
-  int numFilters_;
-  bool asRowVector_;
-
- public:
-  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~FeatureMapExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
-
-bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  numFilters_ = config_.num_filters();
-  asRowVector_ = config_.user_arg() != "as_col_vec";
-  return true;
-}
-
-void FeatureMapExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inputV = getInputValue(0);
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inputV->getWidth();
-  resetOutput(batchSize, imgSize * numFilters_);
-
-  MatrixPtr outputV = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        outVTmp->addRowVector(*inVTmp);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        outVTmp->addColVector(*inVTmp);
-      }
-    }
-  }
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inGrad = getInputGrad(0);
-  if (NULL == inGrad) {
-    return;
-  }
-  MatrixPtr outGrad = getOutputGrad();
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inGrad->getWidth();
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        inGradTmp->collectBias(*outGradTmp, 1);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        inGradTmp->sumRows(*outGradTmp, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle.
diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.cpp b/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
deleted file mode 100644
index b9f1bc99f..000000000
--- a/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FullMatrixProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(fc, FullMatrixProjection);
-
-FullMatrixProjection::FullMatrixProjection(const ProjectionConfig& config,
-                                           const ParameterPtr& parameter,
-                                           bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(
-      new Weight(config.input_size(), config.output_size(), parameter));
-}
-
-void FullMatrixProjection::forward() {
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(*(in_->value), *(weight_->getW()), 1, 1);
-}
-
-void FullMatrixProjection::backward(const UpdateCallback& callback) {
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(
-        *(in_->value->getTranspose()), *(out_->grad), 1, 1);
-  }
-
-  // If callback does not change value, backward propagation error
-  // asynchronously, so that we can do the callback concurrently.
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(*(out_->grad), *(weight_->getW()->getTranspose()), 1, 1);
-  }
-
-  hl_set_sync_flag(syncFlag);
-  if (weight_->getWGrad()) {
-    parameter_->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.h b/paddle/legacy/gserver/layers/FullMatrixProjection.h
deleted file mode 100644
index c33d02a3a..000000000
--- a/paddle/legacy/gserver/layers/FullMatrixProjection.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/utils/Stat.h"
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * FullMatrixProjection performs full matrix multiplication:
- * \f[
- *    out.row[i] += in.row[i] * weight
- * \f]
- *
- * The config file api is full_matrix_projection.
- */
-class FullMatrixProjection : public Projection {
- public:
-  FullMatrixProjection(const ProjectionConfig& config,
-                       const ParameterPtr& parameter,
-                       bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
deleted file mode 100644
index 07f4dfbe3..000000000
--- a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(fc, FullyConnectedLayer);
-
-bool FullyConnectedLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    // Option the parameters
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-
-    // create a new weight
-    if (parameters_[i]->isSparse()) {
-      CHECK_LE(parameters_[i]->getSize(), width * height);
-    } else {
-      CHECK_EQ(parameters_[i]->getSize(), width * height);
-    }
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void FullyConnectedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto* sparseParam =
-        dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-    if (sparseParam) {
-      MatrixPtr input = getInputValue(i);
-      sparseParam->addRows(input);
-    }
-  }
-}
-
-void FullyConnectedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto input = getInput(i);
-    CHECK(input.value) << "The input of 'fc' layer must be matrix";
-    REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
-           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FullyConnectedLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the W-gradient for the current layer */
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr input_T = getInputValue(i)->getTranspose();
-      MatrixPtr oGrad = getOutputGrad();
-      {
-        REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
-      }
-    }
-
-    // If callback does not change value, backprop error asynchronously so that
-    // we can do the callback concurrently.
-    hl_set_sync_flag(false);
-
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
-    }
-
-    hl_set_sync_flag(syncFlag);
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.h b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
deleted file mode 100644
index 7e29cac04..000000000
--- a/paddle/legacy/gserver/layers/FullyConnectedLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * A layer has full connections to all neurons in the previous layer.
- * It computes an inner product with a set of learned weights, and
- * (optionally) adds biases.
- *
- * The config file api is fc_layer.
- */
-
-class FullyConnectedLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
-  ~FullyConnectedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
deleted file mode 100644
index bdcd445cb..000000000
--- a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GatedRecurrentLayer.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer);
-
-bool GatedRecurrentLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-  gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0));
-  stateWeight_.reset(new Weight(
-      getSize(), getSize(), parameters_[0], 2 * getSize() * getSize()));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  reversed_ = config_.reversed();
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-
-  GruCompute::init(config_);
-  useBatch_ = true;
-
-  return true;
-}
-
-void GatedRecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed gated "
-                       "recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-
-  // TODO(hedaoyuan): support prev_batch_state
-  CHECK(!FLAGS_prev_batch_state) << "Not supported";
-
-  useBatch_ = false;
-}
-
-void GatedRecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for GatedRecurrentLayer state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr GatedRecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void GatedRecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  // batchSize = length of total frames in a batch (NOT size of mini-batch)
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    forwardBatch(batchSize, numSequences, starts, input.value);
-  } else {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  }
-}
-
-void GatedRecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    backwardBatch(batchSize, input.grad);
-  } else {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void GatedRecurrentLayer::forwardSequence(int batchSize,
-                                          size_t numSequences,
-                                          const int* starts,
-                                          MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = nullptr;
-
-  if (reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&gruValue](bool reversed, int frameSize) {
-    gruValue.prevOutValue = gruValue.outputValue;
-    if (!reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-    }
-  };
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      gruValue.prevOutValue = prevOutput_->getData();
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        GruCompute::forward<1>(gruValue, getSize());
-      } else {
-        GruCompute::forward<0>(gruValue, getSize());
-      }
-
-      nextFrame(reversed_, getSize());
-    }
-    if (!reversed_) {
-      if (!prevOutput_) gruValue.prevOutValue = nullptr;
-    } else {
-      gruValue.prevOutValue = nullptr;
-    }
-  }
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void GatedRecurrentLayer::backwardSequence(int batchSize,
-                                           size_t numSequences,
-                                           const int* starts,
-                                           MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str());
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-    gruGrad.gateGrad += (batchSize - 1) * getSize() * 3;
-    gruGrad.resetOutputGrad += (batchSize - 1) * getSize();
-    gruGrad.outputGrad += (batchSize - 1) * getSize();
-    gruValue.prevOutValue = gruValue.outputValue - getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad - getSize();
-  } else {
-    gruValue.prevOutValue = gruValue.outputValue + getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad + getSize();
-  }
-
-  auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-      gruGrad.gateGrad += frameSize * 3;
-      gruGrad.resetOutputGrad += frameSize;
-      gruGrad.outputGrad += frameSize;
-      gruValue.prevOutValue = gruValue.outputValue + frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-      gruGrad.gateGrad -= frameSize * 3;
-      gruGrad.resetOutputGrad -= frameSize;
-      gruGrad.outputGrad -= frameSize;
-      gruValue.prevOutValue = gruValue.outputValue - frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize;
-    }
-  };
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          gruValue.prevOutValue = nullptr;
-          gruGrad.prevOutGrad = nullptr;
-        }
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize());
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize());
-        }
-        nextFrame(reversed_, getSize());
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-}
-
-void GatedRecurrentLayer::forwardBatch(int batchSize,
-                                       size_t numSequences,
-                                       const int* starts,
-                                       MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int curBatchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
-      gruValue.outputValue = outputValueTmp->getData();
-      gruValue.gateValue =
-          (batchValue_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      curBatchSize = outputValueTmp->getHeight();
-      gruValue.prevOutValue =
-          (n == 0
-               ? nullptr
-               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
-
-      {
-        if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
-        } else {
-          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
-        }
-      }
-    }
-  }
-  { batchValue_->copyBackSeq(*output_.value); }
-}
-
-void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  { batchGrad_->copyFromSeq(*output_.grad); }
-
-  {
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      gruValue.gateValue =
-          (batchGrad_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
-      gruGrad.outputGrad = outputGradTmp->getData();
-      gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
-      gruGrad.resetOutputGrad =
-          (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();
-
-      {
-        batchSize = outputGradTmp->getHeight();
-        gruValue.prevOutValue =
-            (n == 0
-                 ? nullptr
-                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
-        gruGrad.prevOutGrad =
-            (n == 0 ? nullptr
-                    : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
-
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
deleted file mode 100644
index 8bbf01ce2..000000000
--- a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Please refer to "Junyoung Chung, Empirical Evaluation
- * of Gated Recurrent Neural Networks on Sequence Modeling".
- *
- * GatedRecurrentLayer takes 1 input layer with size * 3.
- * Input layer is diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * parameter and biasParameter is also diveded into 3 equal parts:
- *   - parameter consists of (U_z, U_r, U)
- *   - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\
- * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\
- * \f]
- *
- * @note
- * - dot denotes "element-wise multiplication".
- * - actNode is defined by config active_type
- * - actGate is defined by config actvie_gate_type
- *
- * The config file is grumemory.
- */
-
-class GatedRecurrentLayer : public Layer, public GruCompute {
- public:
-  explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int* starts,
-                       MatrixPtr inputValue);
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int* starts,
-                        MatrixPtr inputGrad);
-
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts,
-                    MatrixPtr inputValue);
-  void backwardBatch(int batchSize, MatrixPtr inputGrad);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> gateWeight_;
-  std::unique_ptr<Weight> stateWeight_;
-  std::unique_ptr<Weight> bias_;
-
-  Argument gate_;
-  Argument resetOutput_;
-
-  bool reversed_;
-  bool useBatch_;
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-  std::unique_ptr<ActivationFunction> activationGate_;
-
-  MatrixPtr prevOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GetOutputLayer.cpp b/paddle/legacy/gserver/layers/GetOutputLayer.cpp
deleted file mode 100644
index 7c1e3c407..000000000
--- a/paddle/legacy/gserver/layers/GetOutputLayer.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class GetOutputLayer : public Layer {
- public:
-  explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GetOutputLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-    CHECK_EQ(1U, inputLayers_.size());
-    CHECK_NE(inputArgument_[0], "");
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    output_ = getPrev(0)->getOutput(inputArgument_[0]);
-  }
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-REGISTER_LAYER(get_output, GetOutputLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.cpp b/paddle/legacy/gserver/layers/GruCompute.cpp
deleted file mode 100644
index adad6285b..000000000
--- a/paddle/legacy/gserver/layers/GruCompute.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/legacy/function/GruFunctor.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void GruCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-}
-
-template <>
-void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
-  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
-                                             hppl::forward::gru_finalOutput(),
-                                             value,
-                                             frameSize,
-                                             batchSize,
-                                             activeNode_,
-                                             activeGate_);
-}
-
-template <>
-void GruCompute::backward<0>(hl_gru_value value,
-                             hl_gru_grad grad,
-                             int frameSize,
-                             int batchSize) {
-  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
-      hppl::backward::gru_stateGrad(),
-      hppl::backward::gru_resetGrad(),
-      value,
-      grad,
-      frameSize,
-      batchSize,
-      activeNode_,
-      activeGate_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.cu b/paddle/legacy/gserver/layers/GruCompute.cu
deleted file mode 100644
index 54be6b804..000000000
--- a/paddle/legacy/gserver/layers/GruCompute.cu
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-
-#include "hl_recurrent_apply.cuh"
-
-namespace paddle {
-
-template <>
-void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
-  hl_gpu_gru_forward(hppl::forward::gru_resetOutput(),
-                     hppl::forward::gru_finalOutput(),
-                     value,
-                     frameSize,
-                     batchSize,
-                     activeNode_,
-                     activeGate_);
-}
-
-template <>
-void GruCompute::backward<1>(hl_gru_value value,
-                             hl_gru_grad grad,
-                             int frameSize,
-                             int batchSize) {
-  hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
-                      hppl::backward::gru_resetGrad(),
-                      value,
-                      grad,
-                      frameSize,
-                      batchSize,
-                      activeNode_,
-                      activeGate_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.h b/paddle/legacy/gserver/layers/GruCompute.h
deleted file mode 100644
index 6feea7aca..000000000
--- a/paddle/legacy/gserver/layers/GruCompute.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-class GruCompute {
- public:
-  void init(LayerConfig &config);
-
-  template <bool useGpu>
-  void forward(hl_gru_value value, int frameSize, int batchSize = 1);
-
-  template <bool useGpu>
-  void backward(hl_gru_value value,
-                hl_gru_grad grad,
-                int frameSize,
-                int batchSize = 1);
-
- public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruStepLayer.cpp b/paddle/legacy/gserver/layers/GruStepLayer.cpp
deleted file mode 100644
index 2480e42d6..000000000
--- a/paddle/legacy/gserver/layers/GruStepLayer.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent
- * layer group. GruStepLayer takes 2 input layer.
- * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * - input[1] with size: {prev_out}.
- *
- * parameter and biasParameter is also diveded into 3 equal parts:
- * - parameter consists of (U_z, U_r, U)
- * - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r)  \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o)
- * \\
- * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out)
- * \f]
- *
- * @note
- *   - dot denotes "element-wise multiplication".
- *   - actNode is defined by config active_type
- *   - actGate is defined by config actvie_gate_type
- *
- * The config file api if gru_step_layer.
- */
-class GruStepLayer : public Layer, public GruCompute {
- protected:
-  Argument gate_;
-  Argument resetOutput_;
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
- public:
-  explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GruStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(gru_step, GruStepLayer);
-
-bool GruStepLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  GruCompute::init(config_);
-  return true;
-}
-
-void GruStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  CHECK_EQ(getSize(), prevOutput.value->getWidth());
-
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 3,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(resetOutput_,
-                     batchSize,
-                     getSize(),
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  if (useGpu_) {
-    GruCompute::forward<1>(gruValue, getSize(), batchSize);
-  } else {
-    GruCompute::forward<0>(gruValue, getSize(), batchSize);
-  }
-}
-
-void GruStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str());
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (weight_->getWGrad()
-           ? weight_->getWGrad()->getData() + getSize() * getSize() * 2
-           : nullptr);
-
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-  if (prevOutput.grad) {
-    gruGrad.prevOutGrad = prevOutput.grad->getData();
-  } else {
-    gruGrad.prevOutGrad = nullptr;
-  }
-
-  if (useGpu_) {
-    GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-  } else {
-    GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
deleted file mode 100644
index 344959940..000000000
--- a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "HierarchicalSigmoidLayer.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer);
-
-bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap,
-                                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK(config_.has_num_classes()) << "num_classes must be specifed in config";
-  numClasses_ = config_.num_classes();
-  CHECK_GE(numClasses_, (size_t)2);
-  codeLength_ = findLastSet(numClasses_ - 1);
-
-  size_t height = numClasses_ - 1;
-
-  /* initialize the weightList */
-  // The last input layer is for label
-  CHECK(!parameters_.back());
-  for (size_t i = 0; i < inputLayers_.size() - 1; i++) {
-    size_t width = inputLayers_[i]->getSize();
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1);
-    biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_));
-  }
-
-  return true;
-}
-
-void HierarchicalSigmoidLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  Matrix::resizeOrCreate(preOutput_.value,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  preOutput_.value->zeroMem();
-
-  if (useGpu_) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           output_.value->getHeight(),
-                           output_.value->getWidth(),
-                           /* trans */ false,
-                           false);
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-    cpuOutput_->copyFrom(*output_.value);
-  } else {
-    cpuOutput_ = output_.value;
-    cpuLabel_ = label;
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_->getW());
-    } else {
-      cpuBias_ = biases_->getW();
-    }
-    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
-  }
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    MatrixPtr input = getInputValue(i);
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuInput_,
-                             input->getHeight(),
-                             input->getWidth(),
-                             /* trans */ false,
-                             false);
-      Matrix::resizeOrCreate(cpuWeight_,
-                             weights_[i]->getW()->getHeight(),
-                             weights_[i]->getW()->getWidth(),
-                             /* trans */ false,
-                             false);
-      cpuInput_->copyFrom(*input);
-      cpuWeight_->copyFrom(*weights_[i]->getW());
-    } else {
-      cpuInput_ = input;
-      cpuWeight_ = weights_[i]->getW();
-    }
-    preOutput_.value->mulByBitCode(
-        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
-  }
-  // keep consistent with the clipping in the following softrelu
-  preOutput_.value->clip(-40.0, 40.0);
-  preOutput_.value->sumByBitCode(numClasses_,
-                                 *cpuLabel_,
-                                 *cpuOutput_,
-                                 -1);  // scaleSum
-  preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
-  preOutput_.value->rowSum(*sum);
-  cpuOutput_->add(*sum);
-  if (useGpu_) {
-    output_.value->copyFrom(*cpuOutput_);
-  } else {
-    output_.value = cpuOutput_;
-  }
-}
-
-void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-  } else {
-    cpuLabel_ = label;
-  }
-  preOutput_.grad->one();
-  preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
-
-  if (biases_ && biases_->getWGrad()) {
-    MatrixPtr biases_grad = biases_->getWGrad();
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_grad);
-    } else {
-      cpuBias_ = biases_grad;
-    }
-    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
-    if (useGpu_) {
-      biases_grad->copyFrom(*cpuBias_);
-    } else {
-      biases_grad = cpuBias_;
-    }
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    /* Calculate the W-gradient for the current layer */
-    MatrixPtr input = getInputValue(i);
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr weights_grad = weights_[i]->getWGrad();
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInput_,
-                               input->getHeight(),
-                               input->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeightGrad_,
-                               weights_grad->getHeight(),
-                               weights_grad->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInput_->copyFrom(*input);
-        cpuWeightGrad_->copyFrom(*weights_grad);
-      } else {
-        cpuInput_ = input;
-        cpuWeightGrad_ = weights_grad;
-      }
-      preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
-      if (useGpu_) {
-        weights_grad->copyFrom(*cpuWeightGrad_);
-      } else {
-        weights_grad = cpuWeightGrad_;
-      }
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-
-    /* Calculate the input layers error */
-    MatrixPtr inputGrad = getInputGrad(i);
-    if (inputGrad) {
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInputGrad_,
-                               inputGrad->getHeight(),
-                               inputGrad->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeight_,
-                               weights_[i]->getW()->getHeight(),
-                               weights_[i]->getW()->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInputGrad_->copyFrom(*inputGrad);
-        cpuWeight_->copyFrom(*weights_[i]->getW());
-      } else {
-        cpuInputGrad_ = inputGrad;
-        cpuWeight_ = weights_[i]->getW();
-      }
-      preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
-      if (useGpu_) {
-        inputGrad->copyFrom(*cpuInputGrad_);
-      } else {
-        inputGrad = cpuInputGrad_;
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
deleted file mode 100644
index 73ef252fd..000000000
--- a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Organize the classes into a binary tree. At each node, a sigmoid function
- * is used to calculate the probability of belonging to the right branch.
- * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
- * Hierarchical Probabilistic Neural Network Language Model."
- *
- * Here we uses a simple way of making the binary tree.
- * Assuming the number of classes C = 6,
- * The classes are organized as a binary tree in the following way:
- *
- * @code{.py}
- * *-*-*- 2
- * | | |- 3
- * | |
- * | |-*- 4
- * |   |- 5
- * |
- * |-*- 0
- *   |- 1
- * @endcode
- *
- * where * indicates an internal node, and each leaf node represents a class.
- * - Node 0 ... C-2 are internal nodes.
- * - Node C-1 ... 2C-2 are leaf nodes.
- * - Class c is represented by leaf node \f$c+C-1\f$.
- *
- * We assign an id for each node:
- * - the id of root be 0.
- * - the left child of a node i is 2*i+1.
- * - the right child of a node i is 2*i+2.
- *
- * It's easy to see that:
- * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
- * - the j-th level ancestor of node i is
- * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
- * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
- *
- * The config file api is hsigmod_layer.
- */
-class HierarchicalSigmoidLayer : public Layer {
- public:
-  explicit HierarchicalSigmoidLayer(const LayerConfig& config)
-      : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  /**
-   * The last of inputs is label layer.
-   */
-  LayerPtr getLabelLayer() { return inputLayers_.back(); }
-
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  /// number of classes
-  size_t numClasses_;
-  /// codeLength_ = \f$1 + \left\lfloor log_{2}(numClasses-1)\right\rfloor\f$
-  int codeLength_;
-  /// temporary result of output_
-  Argument preOutput_;
-
-  /// The temporary variables in CPU memory.
-  MatrixPtr cpuWeight_;
-  MatrixPtr cpuWeightGrad_;
-  MatrixPtr cpuInput_;
-  MatrixPtr cpuInputGrad_;
-  MatrixPtr cpuBias_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/IdentityProjection.cpp b/paddle/legacy/gserver/layers/IdentityProjection.cpp
deleted file mode 100644
index f707642e0..000000000
--- a/paddle/legacy/gserver/layers/IdentityProjection.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * IdentityProjection performs addition:
- * \f[
- *   out.row[i] += in.row[i]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityProjection : public Projection {
- public:
-  IdentityProjection(const ProjectionConfig& config,
-                     const ParameterPtr& parameter,
-                     bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity, IdentityProjection);
-
-/**
- * Constructed function.
- * @note IdentityProjection should not have any parameter.
- */
-IdentityProjection::IdentityProjection(const ProjectionConfig& config,
-                                       const ParameterPtr& parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity' projection should not have any parameter";
-}
-
-void IdentityProjection::forward() { out_->value->add(*in_->value); }
-
-void IdentityProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->add(*out_->grad);
-  }
-}
-
-/**
- * IdentityOffsetProjection likes IdentityProjection, but layer size may be
- * smaller
- * than input size. It selects dimensions [offset, offset+layer_size) from input
- * to
- * perform addition:
- * \f[
- *   out.row[i] += in.row[i + \textrm{offset}]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityOffsetProjection : public Projection {
- public:
-  IdentityOffsetProjection(const ProjectionConfig& config,
-                           const ParameterPtr& parameter,
-                           bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection);
-
-/**
- * Constructed function.
- * @note IdentityOffsetProjection should not have any parameter.
- */
-IdentityOffsetProjection::IdentityOffsetProjection(
-    const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity_offset' projection "
-                       "should not have any parameter";
-  CHECK_LE(config.output_size() + config.offset(), config.input_size());
-}
-
-void IdentityOffsetProjection::forward() {
-  out_->value->addAtOffset(*in_->value, config_.offset());
-}
-
-void IdentityOffsetProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->addAtOffset(*out_->grad, config_.offset());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/InterpolationLayer.cpp b/paddle/legacy/gserver/layers/InterpolationLayer.cpp
deleted file mode 100644
index ed2294e8a..000000000
--- a/paddle/legacy/gserver/layers/InterpolationLayer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for linear interpolation with two inputs,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
- * \f]
- * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs,
- * \f$w\f$ is (batchSize x 1) weight vector,
- * and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is interpolation_layer.
- */
-
-class InterpolationLayer : public Layer {
- protected:
-  /// weightLast = 1 - weight
-  MatrixPtr weightLast_;
-  MatrixPtr tmpMatrix;
-
- public:
-  explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~InterpolationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(interpolation, InterpolationLayer);
-
-bool InterpolationLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(3U, inputLayers_.size());
-
-  return true;
-}
-
-void InterpolationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(dataDim, inV2->getWidth());
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(batchSize, inV2->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_);
-  weightLast_->one();
-  weightLast_->sub(*weightV);
-
-  REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str());
-  // outV = inV1 * weight + inV2 * weightLast
-  outV->addRowScale(0, *inV1, *weightV);
-  outV->addRowScale(0, *inV2, *weightLast_);
-}
-
-void InterpolationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr inG2 = getInputGrad(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str());
-
-  if (inG0) {
-    Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_);
-
-    // inG0 += outG .* (inV1 - inV2)
-    tmpMatrix->sub(*inV1, *inV2);
-    inG0->rowDotMul(0, *outG, *tmpMatrix);
-  }
-
-  if (inG1) {
-    // inG1 += outG * weight
-    inG1->addRowScale(0, *outG, *weightV);
-  }
-
-  if (inG2) {
-    // inG2 += outG * weightLast
-    inG2->addRowScale(0, *outG, *weightLast_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
deleted file mode 100644
index 7fd25954e..000000000
--- a/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class KmaxSeqScoreLayer : public Layer {
- private:
-  MatrixPtr scores_;
-  size_t beamSize_;
-  void kmaxScorePerSeq(const real* score,
-                       real* sortedRes,
-                       const ICpuGpuVectorPtr seqStartPos);
-
- public:
-  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
-
-bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  CHECK_EQ(1U, inputLayers_.size());
-
-  beamSize_ = config_.beam_size();
-  CHECK_GE(beamSize_, 1U);
-
-  setNeedSequenceInfo(false);
-  setNeedGradient(false);
-  return ret;
-}
-
-void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
-                                        real* sortedIds,
-                                        const ICpuGpuVectorPtr seqStartPos) {
-  int* starts = seqStartPos->getMutableData(false);
-  std::vector<real> indices;
-  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
-    int seqLen = starts[i + 1] - starts[i];
-    int k = std::min(static_cast<int>(beamSize_), seqLen);
-
-    indices.resize(seqLen, 0);
-    std::iota(begin(indices), end(indices), 0.);
-    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
-    std::partial_sort(
-        begin(indices),
-        begin(indices) + k,
-        end(indices),
-        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
-    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
-  }
-}
-
-void KmaxSeqScoreLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const MatrixPtr inputScore = getInputValue(0);
-
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "input of " << getName()
-      << " must be a sequence or a nested sequence.";
-  CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName() << " are scores over a sequence or "
-      << "a nested sequence, so its width must be 1.";
-
-  if (useGpu_) {
-    /*
-     * currently, this Layer only runs in CPU, if the other part of the model is
-     * runing on GPU, then copy the input to this layer from GPU to CPU.
-     */
-    Matrix::resizeOrCreate(scores_,
-                           inputScore->getHeight(),
-                           1,
-                           false /* trans */,
-                           false /* useGpu */);
-    scores_->copyFrom(*inputScore);
-  } else {
-    scores_ = inputScore;
-  }
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but output of this layer which is some selected indices of the give
-   * sequence are actually filled with int types so that storing int types
-   * information in a real number matrix is dangerous, since real numbers will
-   * be convered to int types.
-   */
-  Matrix::resizeOrCreate(
-      output_.value,
-      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
-      beamSize_,
-      false,
-      false);
-  output_.value->one();
-  output_.value->mulScalar(-1.);
-
-  kmaxScorePerSeq(scores_->getData(),
-                  output_.value->getData(),
-                  input.hasSubseq() ? input.subSequenceStartPositions
-                                    : input.sequenceStartPositions);
-}
-
-void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
deleted file mode 100644
index a3e627e57..000000000
--- a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "L2DistanceLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(l2_distance, L2DistanceLayer);
-
-bool L2DistanceLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
-                                     << "only two inputs.";
-  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
-                           << "is fixed to be 1.";
-
-  return true;
-}
-
-void L2DistanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const auto inV1 = getInputValue(0);
-  const auto inV2 = getInputValue(1);
-
-  CHECK(inV1 && inV2);
-  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
-      << "The height of two inputs of this layer must be the same.";
-  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
-      << "The width of two inputs of this layer must be the same.";
-
-  int batchSize = inV1->getHeight();
-  int output_dim = getSize();
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-    reserveOutput(batchSize, output_dim);
-    auto outV = getOutputValue();
-    CHECK(outV) << "The output matrix should not be null.";
-
-    Matrix::resizeOrCreate(
-        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
-
-    inputSub_->assign(*inV1);
-    inputSub_->sub(*inV2);
-    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
-    outV->sqrt2(*outV);
-  }
-}
-
-void L2DistanceLayer::backward(const UpdateCallback& callback) {
-  const auto outG = getOutputGrad();
-  const auto outV = getOutputValue();
-  CHECK(outG && outV);
-
-  auto inGrad1 = getInputGrad(0);
-  auto inGrad2 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-
-    if (inGrad1 || inGrad2) {
-      outV->scalarDiv(*outV, 1.);
-      outV->dotMul(*outG, *outV);
-    }
-
-    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
-
-    if (inGrad2) {
-      inputSub_->mulScalar(-1.);
-      inGrad2->addRowScale(0, *inputSub_, *outV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.h b/paddle/legacy/gserver/layers/L2DistanceLayer.h
deleted file mode 100644
index aa8aabd9c..000000000
--- a/paddle/legacy/gserver/layers/L2DistanceLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief The layer calculates the l2 distance between two input vectors.
- * \f[
- * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim)
- * - Input2: A vector (batchSize * dataDim)
- * - Output: A vector (batchSize * 1)
- *
- * The configuration api is: l2_distance_layer.
- */
-
-class L2DistanceLayer : public Layer {
- public:
-  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
-  ~L2DistanceLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  // Store the result of subtracting Input2 from Input1 in forward computation,
-  // which will be reused in backward computation.
-  MatrixPtr inputSub_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.cpp b/paddle/legacy/gserver/layers/Layer.cpp
deleted file mode 100644
index 890d33552..000000000
--- a/paddle/legacy/gserver/layers/Layer.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include "CostLayer.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Error.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "ValidationLayer.h"
-#endif
-
-DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
-
-namespace paddle {
-
-Layer::Layer(const LayerConfig& config, bool useGpu)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(CPU_DEVICE),
-      needSequenceInfo_(true) {}
-
-bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  output_.deviceId = deviceId_;
-
-  for (auto& inputConfig : config_.inputs()) {
-    std::string inputName = inputConfig.input_layer_name();
-    LayerPtr inputLayer;
-    CHECK(mapGet(inputName, layerMap, &inputLayer))
-        << "Cannot find input layer " << inputName << " for layer "
-        << getName();
-    this->addPrev(inputLayer);
-
-    inputLayer->addOutputArgument(deviceId_);
-
-    if (inputConfig.has_input_parameter_name()) {
-      ParameterPtr parameter;
-      CHECK(
-          mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter))
-          << "Cannot find input parameter "
-          << inputConfig.input_parameter_name() << " for layer " << getName();
-      parameter->incShared();
-      CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-      parameters_.push_back(parameter);
-    } else {
-      parameters_.push_back(nullptr);
-    }
-
-    if (inputConfig.has_input_layer_argument()) {
-      inputArgument_.push_back(inputConfig.input_layer_argument());
-    } else {
-      inputArgument_.push_back("");
-    }
-  }
-
-  if (config_.has_bias_parameter_name()) {
-    CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_))
-        << "Cannot find bias parameter " << config_.bias_parameter_name()
-        << " for layer " << getName();
-    biasParameter_->incShared();
-    CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId());
-  }
-
-  /* specify the activation function according to the configuration */
-  std::string action_type = config_.active_type();
-  activation_.reset(ActivationFunction::create(action_type));
-  CHECK(activation_);
-
-  initNeedFlags();
-  markInBackward_.assign(inputLayers_.size(), false);
-
-  return true;
-}
-
-ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
-
-LayerPtr Layer::create(const LayerConfig& config) {
-  std::string type = config.type();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOTE: As following types have illegal character '-',
-  // they can not use REGISTER_LAYER to registrar.
-  // Besides, to fit with old training models,
-  // they can not use '_' instead.
-  if (type == "multi-class-cross-entropy")
-    return LayerPtr(new MultiClassCrossEntropy(config));
-  else if (type == "rank-cost")
-    return LayerPtr(new RankingCost(config));
-  else if (type == "auc-validation")
-    return LayerPtr(new AucValidation(config));
-  else if (type == "pnpair-validation")
-    return LayerPtr(new PnpairValidation(config));
-#endif
-
-  return LayerPtr(registrar_.createByType(config.type(), config));
-}
-
-void Layer::resetSpecifyOutput(Argument& output,
-                               size_t height,
-                               size_t width,
-                               bool isValueClean,
-                               bool isGradClean) {
-  SetDevice device(output.deviceId);
-
-  Matrix::resizeOrCreate(
-      output.value, height, width, /* trans */ false, useGpu(output.deviceId));
-  if (isValueClean) {
-    output.value->zeroMem();
-  }
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    Matrix::resizeOrCreate(
-        output.grad, height, width, /* trans */ false, useGpu(output.deviceId));
-    if (isGradClean) {
-      output.grad->zeroMem();
-    }
-  }
-}
-
-void Layer::resizeOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, false);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false);
-  }
-}
-
-void Layer::reserveOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true);
-  }
-}
-
-void Layer::resetOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, true, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true);
-  }
-}
-
-void Layer::addOutputArgument(int deviceId) {
-  if (deviceId == deviceId_) {
-    output_.countIncrement();
-    return;
-  } else {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == deviceId) {
-        outputOtherDevice_[i].countIncrement();
-        return;
-      }
-    }
-  }
-
-  Argument argu;
-  argu.deviceId = deviceId;
-  outputOtherDevice_.push_back(argu);
-  outputOtherDevice_.back().countIncrement();
-}
-
-void Layer::copyOutputToOtherDevice() {
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    SetDevice device(outputOtherDevice_[i].deviceId);
-    // If outputOtherDevice_[i].value is a CpuMatrix,
-    // the copyFrom is a synchronous interface.
-    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
-    // calculations are all on HPPL_STREAM_DEFAULT,
-    // copyFrom can be an asynchronous interface.
-    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
-                                          HPPL_STREAM_DEFAULT);
-    outputOtherDevice_[i].sequenceStartPositions =
-        output_.sequenceStartPositions;
-    outputOtherDevice_[i].subSequenceStartPositions =
-        output_.subSequenceStartPositions;
-    outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-
-    outputOtherDevice_[i].notifyValueReady();
-  }
-}
-
-void Layer::waitInputValue() {
-  for (size_t i = 0; i != inputLayers_.size(); i++) {
-    if (inputLayers_[i]->getDeviceId() != deviceId_) {
-      getInput(i).waitValueReady();
-    }
-  }
-}
-
-void Layer::waitAndMergeOutputGrad() {
-  if (!output_.grad || !outputOtherDevice_.size()) {
-    return;
-  }
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    outputOtherDevice_[i].waitGradReady();
-  }
-
-  /* merge output grad */
-  size_t i = 0;
-  if (!output_.getAllCount()) {
-    output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-
-    i++;
-    if (outputOtherDevice_.size() == 1) return;
-  }
-
-  Matrix::resizeOrCreate(tmpGrad_,
-                         output_.grad->getHeight(),
-                         output_.grad->getWidth(),
-                         /* trans */ false,
-                         useGpu(output_.deviceId));
-
-  for (; i != outputOtherDevice_.size(); i++) {
-    tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-    output_.grad->add(*tmpGrad_);
-  }
-}
-
-void Layer::markAllInputGrad() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (!markInBackward_[i]) {
-      inputLayers_[i]->getOutput(deviceId_).notifyGradReady();
-    }
-    markInBackward_[i] = false;
-  }
-}
-
-void Layer::markInputGrad(int inputIndex) {
-  inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady();
-  markInBackward_[inputIndex] = true;
-}
-
-void Layer::zeroGrad() {
-  CHECK(output_.grad.get() != NULL);
-  output_.grad->zeroMem();
-}
-
-void Layer::initNeedFlags() {
-  auto initFlag = [this](
-      bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) {
-    flag = false;
-    if (biasParameter_ && biasParameter_->hasType(type)) {
-      flag = true;
-    }
-    if (!flag) {
-      for (auto& para : parameters_) {
-        if (para && para->hasType(type)) {
-          flag = true;
-          break;
-        }
-      }
-    }
-    if (!flag) {
-      for (auto& layer : inputLayers_) {
-        if ((layer.get()->*flagQueryFunc)()) {
-          flag = true;
-        }
-      }
-    }
-  };
-  initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT);
-}
-
-void Layer::showOutputStats() {
-  MatrixPtr out = getOutputValue();
-  if (!out) return;
-  if (!out->getElementCnt()) {
-    LOG(INFO) << "The number of output of " << config_.name()
-              << " is 0, skip to show the statistics";
-    return;
-  }
-  MatrixPtr outSquare;
-  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
-    GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
-    outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(),
-                                                  tmp->getWidth(),
-                                                  tmp->getElementCnt(),
-                                                  tmp->getValueType(),
-                                                  tmp->getFormat());
-  } else {
-    outSquare = out->clone();
-  }
-  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  real mean = outSquare->getSum() / out->getElementCnt();
-  real min;
-  real max;
-  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
-    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
-    min = tmpMat->getMin();
-    max = tmpMat->getMax();
-    tmpMat->square2();
-    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
-  } else {
-    min = outSquare->getMin();
-    max = outSquare->getMax();
-    outSquare->square2();
-  }
-  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
-  std = std > 0 ? std : 0;
-  LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean
-            << ", "
-            << "std=" << std << ", "
-            << "min=" << min << ", "
-            << "max=" << max;
-}
-
-void Layer::forwardActivation() {
-  /* activation */
-  auto status = activation_->forward(output_);
-  status.check();
-
-  /* dropout */
-  if (config_.drop_rate() > 0) {
-    forwardDropOut();
-    CHECK_NE(activation_->getName(), "softmax")
-        << "Softmax activation cannot be used with Dropout";
-  }
-
-  if (FLAGS_show_layer_stat) {
-    showOutputStats();
-  }
-}
-
-void Layer::backwardActivation() {
-  /* Do error clipping */
-  if (config_.error_clipping_threshold() > 0.0f) {
-    if (FLAGS_log_error_clipping) {
-      VectorPtr outGradVec = Vector::create(
-          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
-      real maxAbsGrad = outGradVec->getAbsMax();
-      if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
-        LOG(INFO) << " layer=" << config_.name() << " need clipping,"
-                  << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
-      }
-    }
-    output_.grad->clip(-config_.error_clipping_threshold(),
-                       config_.error_clipping_threshold());
-  }
-
-  /* Do dropout for delta*/
-  if (config_.drop_rate() > 0 && passType_ != PASS_TEST) {
-    MatrixPtr oGrad = getOutputGrad();
-    oGrad->dotMul(*oGrad, *dropOutMask_);
-  }
-
-  auto status = activation_->backward(output_);
-  status.check();
-}
-
-void Layer::forwardDropOut() {
-  auto& outV = getOutputValue();
-
-  if (passType_ == PASS_TRAIN) {
-    // new dropOutMask_ if dropOutMask_ is null ptr
-    Matrix::resizeOrCreate(dropOutMask_,
-                           outV->getHeight(),
-                           outV->getWidth(),
-                           false,
-                           useGpu(deviceId_));
-    dropOutMask_->randomizeUniform();  // generate a uniform random matrix
-    dropOutMask_->biggerThanScalar(config_.drop_rate());  // random mask
-    outV->dotMul(*outV, *dropOutMask_);                   // dropout
-  } else if (passType_ == PASS_GC) {
-    // only initialize once
-    if (!dropOutMask_) {
-      dropOutMask_ = Matrix::create(
-          outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_));
-      // We use cpu matrix to generate mask so that the mask
-      // will be same for both gpu version and cpu version.
-      // This will help unittest to make sure they have same result.
-      MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth());
-      tmpMask->randomizeUniform();  // generate a uniform random matrix
-      tmpMask->biggerThanScalar(config_.drop_rate());  // random mask
-      dropOutMask_->copyFrom(*tmpMask);
-    }
-    outV->dotMul(*outV, *dropOutMask_);
-  } else {  // passType == PASS_TEST
-    outV->mulScalar(1.0 - config_.drop_rate());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.h b/paddle/legacy/gserver/layers/Layer.h
deleted file mode 100644
index a7ff76dec..000000000
--- a/paddle/legacy/gserver/layers/Layer.h
+++ /dev/null
@@ -1,512 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/function/Function.h"
-#include "paddle/legacy/gserver/activations/ActivationFunction.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/parameter/Argument.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Weight.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-#include "paddle/legacy/utils/Util.h"
-
-/// Macro for registering a layer type.
-/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
-#define REGISTER_LAYER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name(   \
-      []() { Layer::registrar_.registerClass<__class_name>(#__type_name); })
-
-#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \
-  static InitFunction __reg_type_##__type_name(                 \
-      []() { Layer::registrar_.registerClass(#__type_name, createFunction); })
-
-namespace paddle {
-
-class Layer;
-typedef std::shared_ptr<Layer> LayerPtr;
-typedef std::map<std::string, LayerPtr> LayerMap;
-class NeuralNetwork;
-
-/// layer state, used for RNN and LSTM layers
-struct LayerState {
-  std::vector<MatrixPtr> value;
-};
-typedef std::shared_ptr<LayerState> LayerStatePtr;
-
-/// Paddle device ID, MKLDNN is -2, CPU is -1
-enum PADDLE_DEVICE_ID {
-  MKLDNN_DEVICE = -2,
-  CPU_DEVICE = -1,
-};
-
-/**
- * @brief Base class for layer.
- * Define necessary variables and functions for every layer.
- */
-class Layer {
- protected:
-  /// Layer config
-  LayerConfig config_;
-  /// whether to use GPU
-  bool useGpu_;
-  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
-  int deviceId_;
-  /// Input layers
-  std::vector<LayerPtr> inputLayers_;
-  /// Argument of input layers
-  std::vector<std::string> inputArgument_;
-
-  /// Parameter for each input layer.
-  /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter.
-  std::vector<ParameterPtr> parameters_;
-
-  /// nullptr if bias is not needed.
-  ParameterPtr biasParameter_;
-
-  /// Output
-  Argument output_;
-  /// Several outputs stored on different devices, used in 'parallel_nn' case,
-  /// and record them by deviceId_.
-  /// Also used in 'use_mkldnn' case.
-  std::vector<Argument> outputOtherDevice_;
-  /// If there are several outputs, map them by each name.
-  /// MKLDNNLayer use it only to merge output grad
-  std::map<std::string, Argument*> outputMap_;
-  /// Used to merge grad on different devices.
-  MatrixPtr tmpGrad_;
-
-  std::unique_ptr<ActivationFunction> activation_;
-
-  /// Current passType, PASS_TRAIN or PASS_TEST
-  PassType passType_;
-
-  /// Random 0-1 matrix for dropOut
-  MatrixPtr dropOutMask_;
-
-  /// Whether the layer need to compute gradient
-  bool needGradient_;
-  /// Whether the layer need to compute re-sequence information
-  bool needSequenceInfo_;
-
-  /// Mark input grad in(true) or out(false) of backward function.
-  std::vector<bool> markInBackward_;
-
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-
- public:
-  /**
-   * Wait until all input value ready.
-   * Called before Layer::forward() function.
-   */
-  virtual void waitInputValue();
-
-  /**
-   * Copy layer's output_ to other device.
-   * If output layer is in other device, called after Layer::forward() function.
-   */
-  virtual void copyOutputToOtherDevice();
-
-  /**
-   * Wait until all output grad ready and merge them to output_.grad.
-   * Called before Layer::backward() function.
-   */
-  virtual void waitAndMergeOutputGrad();
-
-  /**
-   * Notify previous layer the output grad ready.
-   * Called after Layer::backward() function.
-   */
-  virtual void markAllInputGrad();
-
- protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
-  /**
-   * Notify specified layer the output grad ready.
-   * Called in the backward function.
-   * If do mark input grad in the backward function, you should to ensure
-   * that all input grad will be marked in the backward function.
-   */
-  void markInputGrad(int inputIndex);
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(size_t inputIndex) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(const Layer& inputLayer) const {
-    return inputLayer.getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer with deviceId.
-   */
-  const Argument& getInput(size_t inputIndex, int deviceId) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId);
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value with deviceId.
-   */
-  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).value;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
-  }
-
-  /**
-   * Get the forward-input label.
-   */
-  const IVectorPtr& getInputLabel(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).ids;
-  }
-
-  /**
-   * Change the size of output (value, grad).
-   * Reset to value zero if isValueClean = true,
-   * Reset to grad zero if isGradClean = true.
-   */
-  void resetSpecifyOutput(Argument& output,
-                          size_t height,
-                          size_t width,
-                          bool isValueClean,
-                          bool isGradClean);
-
-  /**
-   * Add output argument to other devices.
-   */
-  void addOutputArgument(int deviceId);
-
- public:
-  explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
-  virtual ~Layer() {}
-
-  /// Register a Layer
-  static ClassRegistrar<Layer, LayerConfig> registrar_;
-
-  /**
-   * Get the flag whether layer need to compute gradient.
-   */
-  bool needGradient() const { return needGradient_; }
-
-  /**
-   * Set the flag whether layer need to compute gradient.
-   */
-  void setNeedGradient(bool need) { needGradient_ = need; }
-
-  /**
-   * Set the flag whether layer need to re-compute sequence information,
-   * which includes sequenceStartPositions or subSequenceStartPositions.
-   */
-  void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; }
-
-  /**
-   * Get layer's name.
-   */
-  const std::string& getName() const { return config_.name(); }
-
-  /**
-   * Get layer's type.
-   */
-  const std::string& getType() const { return config_.type(); }
-
-  /**
-   * Get layer's size.
-   */
-  size_t getSize() const { return config_.size(); }
-
-  /**
-   * Get layer's deviceId.
-   */
-  int getDeviceId() const { return deviceId_; }
-
-  /**
-   * Add the inputLayer.
-   */
-  void addPrev(LayerPtr l) { inputLayers_.push_back(l); }
-
-  /**
-   * Get the size of inputLayer[i].
-   */
-  const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; }
-
-  /**
-   * Get the forward-output value.
-   */
-  const MatrixPtr& getOutputValue() { return output_.value; }
-
-  /**
-   * Get the forward-output label.
-   */
-  const IVectorPtr& getOutputLabel() { return output_.ids; }
-
-  /**
-   * Get the backward-Loss value.
-   */
-  const MatrixPtr& getOutputGrad() { return output_.grad; }
-  /**
-   * If layer has multi-output, set output into outputMap_.
-   */
-  void setOutput(const std::string& name, Argument* output) {
-    outputMap_[name] = output;
-  }
-
-  /**
-   * Get the output map size, if layer has multi-output.
-   */
-  size_t getOutputMapSize() { return outputMap_.size(); }
-
-  /**
-   * Get the output based on layer's name.
-   */
-  Argument& getOutput(const std::string& str = "") {
-    if (str == "") {
-      return output_;
-    } else {
-      auto output = outputMap_.find(str);
-      if (output != outputMap_.end()) {
-        return *output->second;
-      } else {
-        LOG(FATAL) << "No specific output " << str;
-        return *((Argument*)nullptr);
-      }
-    }
-  }
-
-  /**
-   * Get the output based on deviceId.
-   */
-  const Argument& getOutput(int deviceId) const {
-    if (deviceId == getDeviceId()) {
-      return output_;
-    } else {
-      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-        if (outputOtherDevice_[i].deviceId == deviceId) {
-          return outputOtherDevice_[i];
-        }
-      }
-
-      LOG(FATAL) << "No specific device output ";
-      return *((Argument*)nullptr);
-    }
-  }
-
-  /**
-   * Get layer's parameters.
-   */
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  /**
-   * Get layer's bias-parameters.
-   */
-  const ParameterPtr& getBiasParameter() { return biasParameter_; }
-
-  /**
-   * Create pointer of layer.
-   */
-  static LayerPtr create(const LayerConfig& config);
-
-  /**
-   * Resize the output matrix size.
-   */
-  void resizeOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value to zero.
-   */
-  void reserveOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value and grad to zero.
-   */
-  void resetOutput(size_t height, size_t width);
-
-  /**
-   * Clear the gradient of output.
-   */
-  void zeroGrad();
-
-  /**
-   * Intialization.
-   * For example, adding input layers from layerMap and parameterMap.
-   */
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  /**
-   * Intialization for sub network if there has sub network.
-   * @param rootNetwork root network
-   * @param config model config
-   * @param parameterTypes parameter's type
-   * @param useGpu whether to use gpu or not
-   */
-  virtual void initSubNetwork(NeuralNetwork* rootNetwork,
-                              const ModelConfig& config,
-                              const std::vector<ParameterType>& parameterTypes,
-                              bool useGpu) {}
-
-  /**
-   * @brief Access SubNetwork Object.
-   *        If subnetwork exists, then invoke callback with subnetwrk.
-   * @param callback if sub-network is exist, the callback is invoked.
-   */
-  virtual void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) {}
-
-  /**
-   * If use sparse row matrix as parameter,
-   * prefetch feature ids in input label.
-   */
-  virtual void prefetch() {}
-
-  /**
-   * Forward propagation.
-   * All inherited implementation should call Layer::foward() function.
-   */
-  virtual void forward(PassType passType) {
-    passType_ = passType;
-    if (!inputLayers_.empty() && needSequenceInfo_) {
-      const Argument& input = getInput(0);
-      output_.sequenceStartPositions = input.sequenceStartPositions;
-      output_.subSequenceStartPositions = input.subSequenceStartPositions;
-      output_.cpuSequenceDims = input.cpuSequenceDims;
-    }
-  }
-
-  /**
-   * Reset the internal state variables.
-   * Allocate them if they have not been allocated.
-   * This function need to called before Layer::forward() for generating
-   * sequence.
-   *
-   * This is used for sequence generation. When generating sequence, the
-   * calculation at current timestamp depends on the state from previous
-   * timestamp. The model needs to keep the information about the previous
-   * timestamp in the state variables. Layers such as RecurrentLayer,
-   * LstmLayer and ContextLayer have state variables.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state.
-   * @return A copy of internal state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * Show output state.
-   */
-  void showOutputStats();
-
-  /**
-   * Backward propagation.
-   * Should only be called after Layer::forward() function.
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * One pass is finished.
-   */
-  virtual void onPassEnd() {}
-
- protected:
-  /**
-   * Forward of activation function.
-   */
-  void forwardActivation();
-  /**
-   * Backward of activation function.
-   */
-  void backwardActivation();
-  /**
-   * Forward of dropOut.
-   */
-  void forwardDropOut();
-  /**
-   * Initilize the needGradient_ flag.
-   */
-  void initNeedFlags();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.cpp b/paddle/legacy/gserver/layers/LinearChainCRF.cpp
deleted file mode 100644
index 315fc25fa..000000000
--- a/paddle/legacy/gserver/layers/LinearChainCRF.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LinearChainCRF.h"
-#include <algorithm>
-
-namespace paddle {
-
-LinearChainCRF::LinearChainCRF(int numClasses, real* para)
-    : numClasses_(numClasses) {
-  a_ = Matrix::create(para, 1, numClasses_);
-  b_ = Matrix::create(para + numClasses_, 1, numClasses_);
-  w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
-
-  ones_ = Matrix::create(1, numClasses_);
-  ones_->one();
-
-  expW_ = Matrix::create(numClasses_, numClasses_);
-}
-
-// normalize x so that its sum is 1 and return the original sum;
-static real normalizeL1(real* x, int n) {
-  real sum = 0;
-  for (int i = 0; i < n; ++i) {
-    sum += x[i];
-  }
-  // Right now, we just bet that sum won't be zero. If this really happens,
-  // we will figure out what should be done then.
-  CHECK_GT(sum, 0);
-  real s = 1 / sum;
-  for (int i = 0; i < n; ++i) {
-    x[i] *= s;
-  }
-  return sum;
-}
-
-real LinearChainCRF::forward(real* x, int* s, int length) {
-  Matrix::resizeOrCreate(maxX_, length, 1);
-  Matrix::resizeOrCreate(expX_, length, numClasses_);
-  Matrix::resizeOrCreate(alpha_, length, numClasses_);
-  MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  matX->rowMax(*maxX_);
-  expX_->assign(*matX);
-  // subtract max to avoid overflow or underflow
-  expX_->mul(*maxX_, *ones_, (real)-1, (real)1);
-  expX_->exp2();
-
-  real* a = a_->getData();
-  real* b = b_->getData();
-  real* w = w_->getData();
-  real* alpha = alpha_->getData();
-  real* expX = expX_->getData();
-  real* maxX = maxX_->getData();
-
-  expW_->exp2(*w_);
-  real* expW = expW_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    alpha[i] = exp(a[i]) * expX[i];
-  }
-  real ll = -maxX[0] - log(normalizeL1(alpha, numClasses_));
-
-  for (int k = 1; k < length; ++k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real sum = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += alpha[(k - 1) * numClasses_ + j]  // (*)
-               * expW[j * numClasses_ + i];
-      }
-      alpha[k * numClasses_ + i] = expX[k * numClasses_ + i] * sum;
-    }
-    // normalizeL1 is to avoid underflow or overflow at (*)
-    ll -= maxX[k] + log(normalizeL1(alpha + k * numClasses_, numClasses_));
-  }
-  real sum = 0;
-  for (int i = 0; i < numClasses_; ++i) {
-    sum += alpha[(length - 1) * numClasses_ + i] * exp(b[i]);
-  }
-  ll -= log(sum);
-  // Now ll is equal to -log(Z)
-
-  CHECK_LT(*std::max_element(s, s + length), numClasses_);
-  // Calculate the nominator part, which depends on s
-  ll += a[s[0]] + x[s[0]] + b[s[length - 1]];
-  for (int k = 1; k < length; ++k) {
-    ll += x[k * numClasses_ + s[k]] + w[s[k - 1] * numClasses_ + s[k]];
-  }
-
-  VLOG(1) << "ll=" << ll;
-  return -ll;
-}
-
-void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
-  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
-  Matrix::resizeOrCreate(beta_, length, numClasses_);
-  real* b = b_->getData();
-  if (needWGrad) {
-    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
-    matWGrad_->zeroMem();
-    da_ = matWGrad_->subRowMatrix(0, 1);
-    db_ = matWGrad_->subRowMatrix(1, 2);
-    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
-  }
-
-  real* alpha = alpha_->getData();
-  real* beta = beta_->getData();
-  real* expW = expW_->getData();
-  real* expX = expX_->getData();
-  real* grad = matGrad_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    beta[(length - 1) * numClasses_ + i] = exp(b[i]);
-  }
-  normalizeL1(beta + (length - 1) * numClasses_, numClasses_);
-
-  for (int k = length - 2; k >= 0; --k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real sum = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j]  // (**)
-               * beta[(k + 1) * numClasses_ + j] *
-               expX[(k + 1) * numClasses_ + j];
-      }
-      beta[k * numClasses_ + i] = sum;
-    }
-    // normalizeL1 is to avoid underflow or overflow at (**)
-    normalizeL1(beta + k * numClasses_, numClasses_);
-  }
-
-  matGrad_->dotMul(*alpha_, *beta_);
-  matGrad_->rowNormalizeL1(*matGrad_);
-  for (int k = 0; k < length; ++k) {
-    grad[k * numClasses_ + s[k]] -= (real)1;
-  }
-
-  if (needWGrad) {
-    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
-
-    beta_->dotMul(*beta_, *expX_);
-    beta_->rowNormalizeL1(*beta_);
-
-    real* dw = dw_->getData();
-    for (int k = 1; k < length; ++k) {
-      real sum = 0;
-      for (int i = 0; i < numClasses_; ++i) {
-        for (int j = 0; j < numClasses_; ++j) {
-          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-                 beta[k * numClasses_ + j];
-        }
-      }
-      sum = 1 / sum;
-      for (int i = 0; i < numClasses_; ++i) {
-        for (int j = 0; j < numClasses_; ++j) {
-          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                     alpha[(k - 1) * numClasses_ + i] *
-                                     beta[k * numClasses_ + j];
-        }
-      }
-      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
-    }
-  }
-}
-
-void LinearChainCRF::decode(real* x, int* s, int length) {
-  Matrix::resizeOrCreate(alpha_, length, numClasses_);
-  real* a = a_->getData();
-  real* b = b_->getData();
-  real* w = w_->getData();
-  IVector::resizeOrCreate(track_, numClasses_ * length, /* useGpu= */ false);
-  int* track = track_->getData();
-  real* alpha = alpha_->getData();
-
-  for (int i = 0; i < numClasses_; ++i) {
-    alpha[i] = a[i] + x[i];
-  }
-  for (int k = 1; k < length; ++k) {
-    for (int i = 0; i < numClasses_; ++i) {
-      real maxScore = -std::numeric_limits<real>::max();
-      int maxJ = 0;
-      for (int j = 0; j < numClasses_; ++j) {
-        real score = alpha[(k - 1) * numClasses_ + j] + w[j * numClasses_ + i];
-        if (score > maxScore) {
-          maxScore = score;
-          maxJ = j;
-        }
-      }
-      alpha[k * numClasses_ + i] = maxScore + x[k * numClasses_ + i];
-      track[k * numClasses_ + i] = maxJ;
-    }
-  }
-  real maxScore = -std::numeric_limits<real>::max();
-  int maxI = 0;
-  for (int i = 0; i < numClasses_; ++i) {
-    real score = alpha[(length - 1) * numClasses_ + i] + b[i];
-    if (score > maxScore) {
-      maxScore = score;
-      maxI = i;
-    }
-  }
-  s[length - 1] = maxI;
-  for (int k = length - 1; k >= 1; --k) {
-    s[k - 1] = maxI = track[k * numClasses_ + maxI];
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.h b/paddle/legacy/gserver/layers/LinearChainCRF.h
deleted file mode 100644
index 65e239054..000000000
--- a/paddle/legacy/gserver/layers/LinearChainCRF.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCRF {
- public:
-  /**
-   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
-   * The first numClasses values of para are for starting weights (\f$a\f$).
-   * The next numClasses values of para are for ending weights (\f$b\f$),
-   * The remaning values are for transition weights (\f$w\f$).
-   *
-   * The probability of a state sequence s of length \f$L\f$ is defined as:
-   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
-   *                  + \sum_{l=1}^L x_{s_l}
-   *                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
-   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
-   * all possible
-   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
-   */
-  LinearChainCRF(int numClasses, real* para);
-
-  /**
-   * Calculate the negative log likelihood of s given x.
-   * The size of x must be length * numClasses. Each consecutive numClasses
-   * values are the features for one time step.
-   */
-  real forward(real* x, int* s, int length);
-
-  /**
-   * Calculate the gradient with respect to x, a, b, and w.
-   * backward() can only be called after a corresponding call to forward() with
-   * the same x, s and length.
-   * The gradient with respect to a, b, and w will not be calculated if
-   * needWGrad is false.
-   * @note Please call getWGrad() and getXGrad() to get the gradient with
-   * respect to (a, b, w) and x respectively.
-   */
-  void backward(real* x, int* s, int length, bool needWGrad);
-
-  /**
-   * Find the most probable sequence given x. The result will be stored in s.
-   */
-  void decode(real* x, int* s, int length);
-
-  /*
-   * Return the gradient with respect to (a, b, w). It can only be called after
-   * a corresponding call to backward().
-   */
-  MatrixPtr getWGrad() { return matWGrad_; }
-
-  /*
-   * Return the gradient with respect to x. It can only be called after a
-   * corresponding call to backward().
-   */
-  MatrixPtr getXGrad() { return matGrad_; }
-
- protected:
-  int numClasses_;
-  MatrixPtr a_;
-  MatrixPtr b_;
-  MatrixPtr w_;
-  MatrixPtr matWGrad_;
-  MatrixPtr da_;
-  MatrixPtr db_;
-  MatrixPtr dw_;
-  MatrixPtr ones_;
-
-  MatrixPtr expX_;
-  MatrixPtr matGrad_;
-  MatrixPtr alpha_;
-  MatrixPtr beta_;
-  MatrixPtr maxX_;
-  MatrixPtr expW_;
-
-  // track_(k,i) = j means that the best sequence at time k for class i comes
-  // from the sequence at time k-1 for class j
-  IVectorPtr track_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.cpp b/paddle/legacy/gserver/layers/LinearChainCTC.cpp
deleted file mode 100644
index 1fad545b7..000000000
--- a/paddle/legacy/gserver/layers/LinearChainCTC.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LinearChainCTC.h"
-#include <math.h>
-#include <limits>
-
-namespace paddle {
-
-/* log scale */
-const real EXP_MAX = std::numeric_limits<real>::max();
-const real EXP_MIN = std::numeric_limits<real>::min();
-const real LOG_ZERO = std::log(EXP_MIN);
-const real LOG_INFINITY = std::log(EXP_MAX);
-
-static inline real safeExp(real x) {
-  if (x <= LOG_ZERO) {
-    return 0;
-  }
-  if (x >= LOG_INFINITY) {
-    return EXP_MAX;
-  }
-  return std::exp(x);
-}
-
-static inline real safeLog(real x) {
-  if (x <= EXP_MIN) {
-    return LOG_ZERO;
-  }
-  return std::log(x);
-}
-
-// x=lna and y=lnb is log scale, ln(a/b)=lna-lnb
-static inline real logDiv(real x, real y) {
-  if (x - y <= LOG_ZERO) {
-    return LOG_ZERO;
-  }
-  if (x - y >= LOG_INFINITY) {
-    return LOG_INFINITY;
-  }
-  return x - y;
-}
-
-// x=lna and y=lnb is log scale, ln(a*b)=lna+lnb
-static inline real logMul(real x, real y) {
-  if (x + y <= LOG_ZERO) {
-    return LOG_ZERO;
-  }
-  if (x + y >= LOG_INFINITY) {
-    return LOG_INFINITY;
-  }
-  return x + y;
-}
-
-// x=lna and y=lnb is log scale, ln(a+b)=lna+ln(1+exp(lnb-lna)), where b > a
-static inline real logAdd(real x, real y) {
-  if (x < y) {
-    real t = y;
-    y = x;
-    x = t;
-  }
-  return x + safeLog(1 + safeExp(y - x));
-}
-
-static void setLogZero(MatrixPtr mat) {
-  size_t size = mat->getElementCnt();
-  real* data = mat->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] = LOG_ZERO;
-  }
-}
-
-LinearChainCTC::LinearChainCTC(int numClasses, bool normByTimes)
-    : numClasses_(numClasses), normByTimes_(normByTimes), logProb_(0) {
-  // set the class label of blank as "numClasses-1"
-  blank_ = numClasses - 1;
-
-  Matrix::resizeOrCreate(gradTerms_, 1, numClasses_);
-}
-
-real LinearChainCTC::forward(real* softmaxSeq,
-                             int softmaxSeqLen,
-                             int* labelSeq,
-                             int labelSeqLen) {
-  isInvalid_ = false;
-  totalTime_ = softmaxSeqLen;
-  totalSegments_ = labelSeqLen * 2 + 1;
-
-  int requiredTime = labelSeqLen;
-  int oldLabel = -1;
-
-  for (int i = 0; i < labelSeqLen; i++) {
-    if (labelSeq[i] == oldLabel) {
-      requiredTime++;
-    }
-    oldLabel = labelSeq[i];
-  }
-
-  if (totalTime_ < requiredTime) {
-    isInvalid_ = true;
-    return 0;
-  }
-
-  /* calculate the forward and backward variables,
-   * reference Chapter 7.3 of "Alex Grave, Supervised Sequence
-   * Labelling with Recurrent Neural Networks" */
-  Matrix::resizeOrCreate(logActs_, totalTime_, numClasses_, false, false);
-  real* logActsData = logActs_->getData();
-  for (int i = 0; i < totalTime_ * numClasses_; i++) {
-    logActsData[i] = safeLog(softmaxSeq[i]);
-  }
-
-  Matrix::resizeOrCreate(forwardVars_, totalTime_, totalSegments_);
-  Matrix::resizeOrCreate(backwardVars_, totalTime_, totalSegments_);
-
-  /* calculate the forward variables */
-  setLogZero(forwardVars_);
-  real* fwdVars = forwardVars_->getData();
-
-  /* dp initialization at t0 */
-  fwdVars[0] = logActs_->getData()[blank_];
-  if (totalSegments_ > 1) {
-    fwdVars[1] = logActs_->getData()[labelSeq[0]];
-  }
-  /* dp from t1 */
-  for (int i = 1; i < totalTime_; i++) {
-    real* dataPerStep = logActsData + i * numClasses_;
-    real* oldFvars = fwdVars + (i - 1) * totalSegments_;
-    real* fvars = fwdVars + i * totalSegments_;
-    int start, end;
-    segmentRange(start, end, i);
-    for (int j = start; j < end; j++) {
-      real fv;
-      if (j & 1) {
-        int labelIdx = j / 2;
-        int labelVal = labelSeq[labelIdx];
-        fv = logAdd(oldFvars[j], oldFvars[j - 1]);
-        if (j > 1 && (labelVal != labelSeq[labelIdx - 1])) {
-          fv = logAdd(fv, oldFvars[j - 2]);
-        }
-        fv = logMul(fv, dataPerStep[labelVal]);
-      } else {
-        fv = oldFvars[j];
-        if (j) {
-          fv = logAdd(fv, oldFvars[j - 1]);
-        }
-        fv = logMul(fv, dataPerStep[blank_]);
-      }
-      fvars[j] = fv;
-    }
-  }
-
-  real* lastFvs = fwdVars + (totalTime_ - 1) * totalSegments_;
-
-  /* sum the last two value as logprob */
-  logProb_ = lastFvs[totalSegments_ - 1];
-  if (totalSegments_ > 1) {
-    logProb_ = logAdd(logProb_, lastFvs[totalSegments_ - 2]);
-  }
-
-  /* calculate the backward variables */
-  setLogZero(backwardVars_);
-  real* bwdVars = backwardVars_->getData();
-  real* lastBvs = bwdVars + (totalTime_ - 1) * totalSegments_;
-
-  lastBvs[totalSegments_ - 1] = 0;
-  if (totalSegments_ > 1) {
-    lastBvs[totalSegments_ - 2] = 0;
-  }
-
-  for (int i = totalTime_ - 2; i >= 0; i--) {
-    real* oldDataPerStep = logActsData + (i + 1) * numClasses_;
-    real* oldBvars = bwdVars + (i + 1) * totalSegments_;
-    real* bvars = bwdVars + i * totalSegments_;
-    int start, end;
-    segmentRange(start, end, i);
-    for (int j = start; j < end; j++) {
-      real bv;
-      if (j & 1) {
-        int labelIdx = j / 2;
-        int labelVal = labelSeq[labelIdx];
-
-        bv = logAdd(logMul(oldBvars[j], oldDataPerStep[labelVal]),
-                    logMul(oldBvars[j + 1], oldDataPerStep[blank_]));
-        if (j < (totalSegments_ - 2)) {
-          int nextLabelVal = labelSeq[labelIdx + 1];
-          if (labelVal != nextLabelVal) {
-            bv = logAdd(bv,
-                        logMul(oldBvars[j + 2], oldDataPerStep[nextLabelVal]));
-          }
-        }
-      } else {
-        bv = logMul(oldBvars[j], oldDataPerStep[blank_]);
-        if (j < (totalSegments_ - 1)) {
-          bv = logAdd(bv,
-                      logMul(oldBvars[j + 1], oldDataPerStep[labelSeq[j / 2]]));
-        }
-      }
-      bvars[j] = bv;
-    }
-  }
-
-  VLOG(1) << "ctcLoss=" << -logProb_;
-
-  return -logProb_;
-}
-
-void LinearChainCTC::backward(real* softmaxSeq,
-                              real* grad,
-                              int* labelSeq,
-                              int labelSeqLen) {
-  /* if not meet the conditions of CTC computing, then set the grads to zeros */
-  if (isInvalid_) {
-    for (int i = 0; i < totalTime_ * numClasses_; i++) {
-      grad[i] += 0;
-    }
-    return;
-  }
-
-  real* fwdVars = forwardVars_->getData();
-  real* bwdVars = backwardVars_->getData();
-  real* logActsData = logActs_->getData();
-
-  for (int i = 0; i < totalTime_; i++) {
-    setLogZero(gradTerms_);
-    real* gradTermsData = gradTerms_->getData();
-    real* fvars = fwdVars + i * totalSegments_;
-    real* bvars = bwdVars + i * totalSegments_;
-    for (int j = 0; j < totalSegments_; j++) {
-      int k = (j & 1) ? labelSeq[j / 2] : blank_;
-      gradTermsData[k] = logAdd(gradTermsData[k], logMul(fvars[j], bvars[j]));
-    }
-    for (int j = 0; j < numClasses_; j++) {
-      if (normByTimes_) {
-        grad[i * numClasses_ + j] +=
-            -safeExp(
-                logDiv(gradTermsData[j],
-                       logMul(logProb_, logActsData[i * numClasses_ + j]))) /
-            totalTime_;
-      } else {
-        grad[i * numClasses_ + j] += -safeExp(
-            logDiv(gradTermsData[j],
-                   logMul(logProb_, logActsData[i * numClasses_ + j])));
-      }
-    }
-  }
-}
-
-void LinearChainCTC::segmentRange(int& start, int& end, int time) {
-  start = std::max(0, totalSegments_ - (2 * (totalTime_ - time)));
-  end = std::min(totalSegments_, 2 * (time + 1));
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.h b/paddle/legacy/gserver/layers/LinearChainCTC.h
deleted file mode 100644
index e6c4c7bfe..000000000
--- a/paddle/legacy/gserver/layers/LinearChainCTC.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCTC {
- public:
-  LinearChainCTC(int numClasses, bool normByTimes);
-
-  // Calculate the negative log probability as loss
-  real forward(real* softmaxSeq,
-               int softmaxSeqLen,
-               int* labelSeq,
-               int labelSeqLen);
-
-  // calculate the gradient
-  void backward(real* softmaxSeq,
-                real* softmaxSeqGrad,
-                int* labelSeq,
-                int labelSeqLen);
-
- protected:
-  int numClasses_, blank_, totalSegments_, totalTime_;
-  bool normByTimes_;
-  bool isInvalid_;
-
-  MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_;
-
-  real logProb_;
-
-  void segmentRange(int& start, int& end, int time);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.cpp b/paddle/legacy/gserver/layers/LstmCompute.cpp
deleted file mode 100644
index 70f08e1d4..000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-void LstmCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-  activeState_ = hlActiveType(config.active_state_type());
-}
-
-template <>
-void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) {
-  hl_cpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardOneSequence<0>(hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize) {
-  hl_cpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-template <>
-void LstmCompute::forwardBatch<0>(hl_lstm_value value,
-                                  int frameSize,
-                                  int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    forwardOneSequence<0>(value, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-  }
-}
-
-template <>
-void LstmCompute::backwardBatch<0>(hl_lstm_value value,
-                                   hl_lstm_grad grad,
-                                   int frameSize,
-                                   int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    backwardOneSequence<0>(value, grad, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 4;
-    grad.stateGrad += frameSize;
-    grad.stateActiveGrad += frameSize;
-    grad.outputGrad += frameSize;
-    if (grad.prevStateGrad) {
-      grad.prevStateGrad += frameSize;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.cu b/paddle/legacy/gserver/layers/LstmCompute.cu
deleted file mode 100644
index 3f15edcac..000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.cu
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmCompute.h"
-#include "hl_recurrent_apply.cuh"
-
-namespace paddle {
-
-template <>
-void LstmCompute::forwardBatch<1>(hl_lstm_value value,
-                                  int frameSize,
-                                  int batchSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      batchSize,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardBatch<1>(hl_lstm_value value,
-                                   hl_lstm_grad grad,
-                                   int frameSize,
-                                   int batchSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       batchSize,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-template <>
-void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      /* batchSize */ 1,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       /* batchSize */ 1,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.h b/paddle/legacy/gserver/layers/LstmCompute.h
deleted file mode 100644
index ac40c35ef..000000000
--- a/paddle/legacy/gserver/layers/LstmCompute.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-class LstmCompute {
- public:
-  void init(LayerConfig &config);
-
-  /**
-   * LstmLayer batch compute API (forwardBatch, backwardBatch).
-   * If use batch compute api, lstm value(and grad) need to be batch structure.
-   * Compute order:
-   *   forwardBatch:  for 0 <= id < numBatch
-   *   backwardBatch:  for numBatch > id >= 0
-   */
-  template <bool useGpu>
-  void forwardBatch(hl_lstm_value value, int frameSize, int batchSize);
-
-  template <bool useGpu>
-  void backwardBatch(hl_lstm_value value,
-                     hl_lstm_grad grad,
-                     int frameSize,
-                     int batchSize);
-
-  /**
-   * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence).
-   * Compute order(for each sequence):
-   *   forwardOneSequence:
-   *     if (!reversed) for 0 <= seqId < seqLength
-   *     if (reversed)  for seqLength > seqId >= 0
-   *   backwardOneSequence:
-   *     if (!reversed) for seqLength > seqId >= 0
-   *     if (reversed)  for 0 <= seqId < seqLength
-   */
-  template <bool useGpu>
-  void forwardOneSequence(hl_lstm_value value, int frameSize);
-  template <bool useGpu>
-  void backwardOneSequence(hl_lstm_value value,
-                           hl_lstm_grad grad,
-                           int frameSize);
-
- public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-  hl_activation_mode_t activeState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.cpp b/paddle/legacy/gserver/layers/LstmLayer.cpp
deleted file mode 100644
index 43a55d8d4..000000000
--- a/paddle/legacy/gserver/layers/LstmLayer.cpp
+++ /dev/null
@@ -1,805 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_bool(prev_batch_state);
-
-namespace paddle {
-
-REGISTER_LAYER(lstmemory, LstmLayer);
-
-bool LstmLayer::init(const LayerMap &layerMap,
-                     const ParameterMap &parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 7, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 7, biasParameter_));
-    if (bias_->getW()) {
-      localBias_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize() * 4,
-                                  /* trans= */ false,
-                                  useGpu_);
-      checkIg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkFg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkOg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-      localBias_->setData(bias_->getW()->getData());
-      checkIg_->setData(bias_->getW()->getData() + getSize() * 4);
-      checkFg_->setData(bias_->getW()->getData() + getSize() * 5);
-      checkOg_->setData(bias_->getW()->getData() + getSize() * 6);
-    }
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize() * 4,
-                                      /* trans= */ false,
-                                      useGpu_);
-      checkIgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkFgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkOgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4);
-      checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5);
-      checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6);
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  reversed_ = config_.reversed();
-
-  // create IdentityActivation for using drop_rate
-  activation_.reset(ActivationFunction::create(""));
-
-  LstmCompute::init(config_);
-  useBatch_ = true;
-  useSeqParallel_ = false;
-  if (useGpu_ && (getSize() == 32 || getSize() == 64)) {
-    useSeqParallel_ = true;
-  }
-
-  return true;
-}
-
-void LstmLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->resize(0, getSize());
-  prevState_->resize(0, getSize());
-  if (FLAGS_prev_batch_state) {
-    useBatch_ = true;
-  } else {
-    useBatch_ = false;
-  }
-}
-
-void LstmLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state";
-  prevOutput_->resize(state->value[0]->getHeight(),
-                      state->value[0]->getWidth());
-  prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth());
-  prevOutput_->copyFrom(*(state->value[0]));
-  prevState_->copyFrom(*(state->value[1]));
-}
-
-LayerStatePtr LstmLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  if (prevOutput_->getHeight() && prevOutput_->getWidth()) {
-    res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-    res->value[0]->copyFrom(*prevOutput_);
-    res->value.push_back(prevState_->clone(0, 0, useGpu_));
-    res->value[1]->copyFrom(*prevState_);
-  } else {
-    MatrixPtr output =
-        Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    output->resize(0, getSize());
-    state->resize(0, getSize());
-    res->value.push_back(output);
-    res->value.push_back(state);
-  }
-  return res;
-}
-
-void LstmLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  size_t numSequences = input.getNumSequences();
-  const int *starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  if (prevOutput_) {
-    size_t prevNumSeq = useBatch_ ? numSequences : 1;
-    if (prevOutput_->getHeight() == 0) {
-      prevOutput_->resize(prevNumSeq, getSize());
-      prevState_->resize(prevNumSeq, getSize());
-      prevOutput_->zeroMem();
-      prevState_->zeroMem();
-    } else {
-      CHECK_EQ(prevOutput_->getHeight(), prevNumSeq)
-          << "the number of sequences must be the same";
-    }
-    Matrix::resizeOrCreate(totalState_,
-                           prevState_->getHeight() + batchSize,
-                           getSize(),
-                           /*trans*/ false,
-                           useGpu_);
-    state_.value = Matrix::create(nullptr,
-                                  /* height= */ batchSize,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-    state_.value->setData(totalState_->getData() +
-                          prevState_->getHeight() * getSize());
-  } else {
-    Matrix::resizeOrCreate(state_.value,
-                           /* height= */ batchSize,
-                           getSize(),
-                           /* trans= */ false,
-                           useGpu_);
-  }
-  Matrix::resizeOrCreate(preOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (!useBatch_) {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  } else {
-    if (!useSeqParallel_) {
-      forwardBatch(batchSize, numSequences, starts, input.value);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      forwardSeqParallel(batchSize, numSequences, starts, input.value);
-    }
-  }
-  /*  activation */ { forwardActivation(); }
-}
-
-void LstmLayer::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str());
-  /*  Do derivation */ { backwardActivation(); }
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(state_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  state_.grad->zero();
-
-  const int *starts = input.sequenceStartPositions->getData(false);
-  if (!useBatch_) {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  } else {
-    if (!useSeqParallel_) {
-      backwardBatch(batchSize, numSequences, starts, input.grad);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      backwardSeqParallel(batchSize, numSequences, starts, input.grad);
-    }
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void LstmLayer::forwardSequence(int batchSize,
-                                size_t numSequences,
-                                const int *starts,
-                                MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-  lstmValue.prevStateValue = nullptr;
-  if (reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&lstmValue](bool reversed, int frameSize) {
-    lstmValue.prevStateValue = lstmValue.stateValue;
-    if (!reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmValue.outputValue += frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmValue.outputValue -= frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  if (!reversed_) {
-    if (prevState_) {
-      lstmValue.prevStateValue = prevState_->getData();
-    }
-    if (prevOutput_) {
-      frameGate->setData(lstmValue.gateValue);
-      frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        LstmCompute::forwardOneSequence<1>(lstmValue, getSize());
-      } else {
-        LstmCompute::forwardOneSequence<0>(lstmValue, getSize());
-      }
-
-      if (l != length - 1) {
-        frameOutput->setData(lstmValue.outputValue);
-        nextFrame(reversed_, getSize());
-        frameGate->setData(lstmValue.gateValue);
-        frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-      }
-    }
-    if (n != numSequences - 1) {
-      frameOutput->setData(lstmValue.outputValue);
-      nextFrame(reversed_, getSize());
-      frameGate->setData(lstmValue.gateValue);
-      if (!reversed_) {
-        if (!prevState_) lstmValue.prevStateValue = nullptr;
-        if (prevOutput_) {
-          frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-        }
-      } else {
-        lstmValue.prevStateValue = nullptr;
-      }
-    }
-  }
-
-  if (!reversed_) {
-    if (prevState_) {
-      prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1));
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void LstmLayer::backwardSequence(int batchSize,
-                                 size_t numSequences,
-                                 const int *starts,
-                                 MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str());
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = nullptr;
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-  lstmGrad.gateGrad = gate_.grad->getData();
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = nullptr;
-  lstmGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmGrad.stateGrad += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmGrad.outputGrad += (batchSize - 1) * getSize();
-    lstmValue.prevStateValue = lstmValue.stateValue - getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize();
-  } else {
-    lstmValue.prevStateValue = lstmValue.stateValue + getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize();
-  }
-
-  auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmGrad.gateGrad += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmGrad.stateGrad += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmGrad.outputGrad += frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue + frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmGrad.gateGrad -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmGrad.stateGrad -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmGrad.outputGrad -= frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue - frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      int start;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-        start = starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-        start = starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          lstmValue.prevStateValue = nullptr;
-          lstmGrad.prevStateGrad = nullptr;
-        }
-        if (useGpu_) {
-          LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize());
-        } else {
-          LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize());
-        }
-        if (l != length - 1) {
-          frameGate->setData(lstmGrad.gateGrad);
-          nextFrame(reversed_, getSize());
-          frameOutput->setData(lstmGrad.outputGrad);
-          frameOutput->mul(*frameGate, *weightT, 1, 1);
-        } else {
-          nextFrame(reversed_, getSize());
-        }
-      }
-
-      if (weight_->getWGrad()) {
-        if (!reversed_) {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start + 1, length - 1),
-              1,
-              1);
-        } else {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start, length - 1),
-              1,
-              1);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-}
-
-void LstmLayer::forwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int *starts,
-                             MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(
-      batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    if (prevState_) {
-      lstmValue.prevStateValue = totalState_->getData();
-    } else {
-      lstmValue.prevStateValue = nullptr;
-    }
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValue = batchValue_->getBatchValue(n);
-      MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n);
-      batchSize = outputValue->getHeight();
-
-      if (n != 0) {
-        MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize);
-        gateValue->mul(*batch1, *weight_->getW(), 1, 1);
-      } else if (prevOutput_) {
-        Matrix::resizeOrCreate(prevBatchOutput2_,
-                               gateValue->getHeight(),
-                               getSize(),
-                               false,
-                               useGpu_);
-        batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_);
-        gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1);
-
-        batchValue_->prevOutput2Batch(*prevState_,
-                                      *totalState_->subMatrix(0, numSequences));
-      }
-
-      lstmValue.gateValue = gateValue->getData();
-      lstmValue.outputValue = outputValue->getData();
-      lstmValue.stateValue =
-          batchValue_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchValue_->getBatchValue(*preOutput_.value, n)->getData();
-      {
-        if (useGpu_) {
-          LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-        } else {
-          LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-        }
-      }
-      lstmValue.prevStateValue = lstmValue.stateValue;
-    }
-  }
-  {
-    REGISTER_TIMER_INFO("batchToSeq", getName().c_str());
-    batchValue_->copyBackSeq(*output_.value);
-  }
-  if (prevOutput_) {
-    getPrevBatchOutput(numSequences);
-    getPrevBatchState(numSequences);
-  }
-}
-
-void LstmLayer::getPrevBatchOutput(size_t numSequences) {
-  prevOutput_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevOutput_,
-                                     *batchValue_->getBatchValue());
-}
-
-void LstmLayer::getPrevBatchState(size_t numSequences) {
-  prevState_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value);
-}
-
-void LstmLayer::backwardBatch(int batchSize,
-                              size_t numSequences,
-                              const int *starts,
-                              MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  hl_lstm_grad lstmGrad;
-  lstmGrad.stateActiveGrad = preOutput_.grad->getData();
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  {
-    REGISTER_TIMER_INFO("seqToBatch", getName().c_str());
-    batchGrad_->copyFromSeq(*output_.grad);
-  }
-
-  {
-    MatrixPtr weightT = weight_->getW()->getTranspose();
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr outputGrad = batchGrad_->getBatchValue(n);
-      MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n);
-
-      lstmValue.gateValue =
-          batchGrad_->getBatchValue(*gate_.value, n)->getData();
-      lstmValue.stateValue =
-          batchGrad_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchGrad_->getBatchValue(*preOutput_.value, n)->getData();
-      lstmGrad.stateGrad =
-          batchGrad_->getBatchValue(*state_.grad, n)->getData();
-      lstmGrad.gateGrad = gateGrad->getData();
-      lstmGrad.outputGrad = outputGrad->getData();
-      {
-        batchSize = outputGrad->getHeight();
-        if (n != 0) {
-          lstmValue.prevStateValue =
-              batchGrad_->getBatchValue(*state_.value, n - 1)->getData();
-          lstmGrad.prevStateGrad =
-              batchGrad_->getBatchValue(*state_.grad, n - 1)->getData();
-        } else {
-          if (prevState_) {
-            lstmValue.prevStateValue = totalState_->getData();
-            lstmGrad.prevStateGrad = nullptr;
-          } else {
-            lstmValue.prevStateValue = nullptr;
-            lstmGrad.prevStateGrad = nullptr;
-          }
-        }
-        if (useGpu_) {
-          LstmCompute::backwardBatch<1>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        } else {
-          LstmCompute::backwardBatch<0>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        }
-      }
-
-      if (n != 0) {
-        MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize);
-        tmp->mul(*gateGrad, *weightT, 1, 1);
-      }
-
-      if (n != 0 && weight_->getWGrad()) {
-        /* backward weight */
-        MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize);
-        weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1);
-      } else if (prevOutput_ && weight_->getWGrad()) {
-        weight_->getWGrad()->mul(
-            *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1);
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-void LstmLayer::forwardSeqParallel(int batchSize,
-                                   size_t numSequences,
-                                   const int *starts,
-                                   MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, /* scale */ 1);
-  }
-
-  real *gateValue = gate_.value->getData();
-  real *stateValue = state_.value->getData();
-  real *outputValue = output_.value->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *weight = weight_->getW()->getData();
-  hl_lstm_parallel_forward(gateValue,
-                           stateValue,
-                           preOutputValue,
-                           outputValue,
-                           checkIg,
-                           checkFg,
-                           checkOg,
-                           weight,
-                           starts,
-                           getSize(),
-                           numSequences,
-                           reversed_,
-                           activeNode_,
-                           activeGate_,
-                           activeState_);
-}
-
-void LstmLayer::backwardSeqParallel(int batchSize,
-                                    size_t numSequences,
-                                    const int *starts,
-                                    MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str());
-  real *gateValue = gate_.value->getData();
-  real *gateGrad = gate_.grad->getData();
-  real *stateValue = state_.value->getData();
-  real *stateGrad = state_.grad->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *preOutputGrad = preOutput_.grad->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *outputGrad = output_.grad->getData();
-  real *weight = weight_->getW()->getData();
-
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-  if (bias_->getWGrad()) {
-    checkIgGrad = checkIgGrad_->getData();
-    checkFgGrad = checkFgGrad_->getData();
-    checkOgGrad = checkOgGrad_->getData();
-  } else {
-    checkIgGrad = nullptr;
-    checkFgGrad = nullptr;
-    checkOgGrad = nullptr;
-  }
-
-  hl_lstm_parallel_backward_data(gateValue,
-                                 gateGrad,
-                                 stateValue,
-                                 stateGrad,
-                                 preOutputValue,
-                                 preOutputGrad,
-                                 outputGrad,
-                                 checkIg,
-                                 checkIgGrad,
-                                 checkFg,
-                                 checkFgGrad,
-                                 checkOg,
-                                 checkOgGrad,
-                                 weight,
-                                 starts,
-                                 getSize(),
-                                 numSequences,
-                                 reversed_,
-                                 activeNode_,
-                                 activeGate_,
-                                 activeState_);
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-
-  real *outputValue = output_.value->getData();
-  if (weight_->getWGrad()) {
-    real *weightGrad = weight_->getWGrad()->getData();
-    hl_lstm_parallel_backward_weight(weightGrad,
-                                     outputValue,
-                                     gateGrad,
-                                     starts,
-                                     getSize(),
-                                     batchSize,
-                                     numSequences,
-                                     reversed_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.h b/paddle/legacy/gserver/layers/LstmLayer.h
deleted file mode 100644
index 8c8b382f5..000000000
--- a/paddle/legacy/gserver/layers/LstmLayer.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-namespace paddle {
-
-/**
- * @brief LstmLayer takes 1 input layer with size * 4.
- * Input layer is diveded into 4 equal parts:
- *   (input_s, input_ig, input_fg, input_og)
- *
- * For each sequence [start, end] it performs the following computation:
- * @code
- * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
- * state_{i} = actInput(input_s_{i} + bias_s +
- *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
- *             actGate(forgetGate_{i}) * state_{i-1}
- * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
- *             state_{i-1} * inputCheck
- * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
- *             state_{i} * outputCheck
- * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
- *              state_{i-1} * forgetCheck
- * @endcode
- *
- * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
- * - baisParameter consists of
- *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
- *
- * - actInput is defined by config active_type.
- * - actState is defined by config active_state_type.
- * - actGate is defined by config actvie_gate_type.
- *
- * There are two ways to compute, namely one sequence by one sequence or
- * one batch by one batch. By default and no setting pre_batch_state true,
- * it will compute batch by batch.
- *
- * The formula in the paper is as follows:
- * \f[
- * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
- * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
- * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
- * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
- * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
- * h_t = o_t tanh(c_t)
- * \f]
- *
- * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
- * operations on the input sequence were NOT included in LstmLayer. So
- * users should use fc_layer or mixed_layer before lstm_later.
- *
- * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
- * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
- */
-
-class LstmLayer : public Layer, public LstmCompute {
- public:
-  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
-
-  bool init(const LayerMap &layerMap,
-            const ParameterMap &parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback &callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  /**
-   * @brief Compute lstm forward one sequence by one sequence.
-   * @param batchSize The batchSize is not equal to the batch_size in
-   * the config file. It is the total words number of all samples
-   * in this forward batch.
-   * @param numSequences The sample number. It is equal to the batch_size
-   * in the config file.
-   * @param starts Each start position of each samples.
-   * @param inputValue The input values.
-   */
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int *starts,
-                       MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one sequence by one sequence.
-   */
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int *starts,
-                        MatrixPtr inputGrad);
-
-  /**
-   * Compute lstm forward one batch by one batch. The batch value is
-   * reorganized by SequenceToBatch class. The batch output value will
-   * be convert into sequence value after finishing forward. Here, one
-   * batch contains one word of each sample. If the length of each sample
-   * is not equality, the batch will not pads zero and contains less words.
-   * The total batch numbers are the max length of the sequence. The details
-   * can refer to SequenceToBatch class. On GPU mode, it will launch GPU
-   * kernel for loop.
-   *
-   * @code
-   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
-   *   compute one batch.
-   * }
-   * @endcode
-   */
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int *starts,
-                    MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one batch by one batch.
-   */
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int *starts,
-                     MatrixPtr inputGrad);
-
-  /**
-   * This function only supports GPU. It not need to reorganize input into
-   * batch value. It will launch one kernel to parallelly compute forward
-   * propagation in sequence level.
-   */
-  void forwardSeqParallel(int batchSize,
-                          size_t numSequences,
-                          const int *starts,
-                          MatrixPtr inputValue);
-  /**
-   * Backward propagation corresponding to forwardSeqParallel.
-   */
-  void backwardSeqParallel(int batchSize,
-                           size_t numSequences,
-                           const int *starts,
-                           MatrixPtr inputGrad);
-  /**
-   * This function is used for sequence generation and get output after
-   * forwardBatch.
-   */
-  void getPrevBatchOutput(size_t numSequences);
-  /**
-   * This function is used for sequence generation and get state after
-   * forwardBatch.
-   */
-  void getPrevBatchState(size_t numSequences);
-
- protected:
-  /// Learned parameters, shape: (size, 4*size).
-  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
-  std::unique_ptr<Weight> weight_;
-  /// Learned bias parameter, shape: (1, 7 * size).
-  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
-  /// W_{co}\f$.
-  std::unique_ptr<Weight> bias_;
-  /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
-  MatrixPtr localBias_;
-  /// The peephole connection for input gate.
-  MatrixPtr checkIg_;
-  /// The peephole connection for forget gate.
-  MatrixPtr checkFg_;
-  /// The peephole connection for output gate.
-  MatrixPtr checkOg_;
-  /// The gradient of real bias
-  MatrixPtr localBiasGrad_;
-  /// The gradient of peephole connection for input gates.
-  MatrixPtr checkIgGrad_;
-  /// The gradient of peephole connection for forget gates.
-  MatrixPtr checkFgGrad_;
-  /// The gradient of peephole connection for output gates.
-  MatrixPtr checkOgGrad_;
-
-  /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$.
-  Argument state_;
-  /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$.
-  Argument preOutput_;
-  /// Stores the value and gradient of four gates, namely
-  /// \f$i_t, f_t, o_t, c_t\f$.
-  Argument gate_;
-  /// Whether it is reversed lstm.
-  bool reversed_;
-  /// Whether to use batch method to compute.
-  bool useBatch_;
-  /// Whether to use sequence parallell method to compute.
-  bool useSeqParallel_;
-  /// batchValue_ is used in method of batch calculation. It stores the
-  /// batch value after reorganized input.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// The gradient of batchValue_.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-
-  /// Used in generation and stores the state of previous time step.
-  MatrixPtr prevState_;
-  /// Used in generation and stores the output of previous time step.
-  MatrixPtr prevOutput_;
-  MatrixPtr prevBatchOutput2_;
-  /// The total state.
-  MatrixPtr totalState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmStepLayer.cpp b/paddle/legacy/gserver/layers/LstmStepLayer.cpp
deleted file mode 100644
index f02f8ad62..000000000
--- a/paddle/legacy/gserver/layers/LstmStepLayer.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/*
- * LstmStepLayer used in recurrent layer group.
- */
-class LstmStepLayer : public Layer, public LstmCompute {
- protected:
-  Argument state_;
-  Argument gate_;
-  Argument stateActive_;
-  MatrixPtr checkIg_, checkFg_, checkOg_;
-  MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
-  std::unique_ptr<Weight> weight_;
-
- public:
-  explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~LstmStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(lstm_step, LstmStepLayer);
-
-bool LstmStepLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  checkIg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkFg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkOg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkIgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkFgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkOgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    weight_.reset(new Weight(1, getSize() * 3, biasParameter_));
-    if (weight_->getW()) {
-      real* data = weight_->getW()->getData();
-      checkIg_->setData(data);
-      checkFg_->setData(data + getSize());
-      checkOg_->setData(data + getSize() * 2);
-    }
-
-    if (weight_->getWGrad()) {
-      real* data = weight_->getWGrad()->getData();
-      checkIgGrad_->setData(data);
-      checkFgGrad_->setData(data + getSize());
-      checkOgGrad_->setData(data + getSize() * 2);
-    }
-  }
-
-  setOutput("state", &state_);
-  LstmCompute::init(config_);
-  return true;
-}
-
-void LstmStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  CHECK_EQ(getSize(), prevState.value->getWidth());
-  int batchSize = input.getBatchSize();
-  reserveOutput(batchSize, getSize());
-  resetSpecifyOutput(state_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ true);
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 4,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(stateActive_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-
-  if (useGpu_) {
-    LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-  } else {
-    LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-  }
-}
-
-void LstmStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str());
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-
-  lstmGrad.gateGrad = gate_.grad->getData();
-  if (prevState.grad) {
-    lstmGrad.prevStateGrad = prevState.grad->getData();
-  } else {
-    lstmGrad.prevStateGrad = nullptr;
-  }
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = stateActive_.grad->getData();
-  lstmGrad.outputGrad = output_.grad->getData();
-  lstmGrad.checkIgGrad = checkIgGrad_->getData();
-  lstmGrad.checkFgGrad = checkFgGrad_->getData();
-  lstmGrad.checkOgGrad = checkOgGrad_->getData();
-
-  if (useGpu_) {
-    LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize);
-  } else {
-    LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (weight_) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MDLstmLayer.cpp b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
deleted file mode 100644
index 4838183e8..000000000
--- a/paddle/legacy/gserver/layers/MDLstmLayer.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-class CoordIterator {
- public:
-  std::vector<int> dims_;
-  std::vector<bool> directions_;
-  std::vector<int> curPos_;
-  bool end_;
-
-  void step(size_t d, bool reversed) {
-    if (directions_[d] ^ reversed) {
-      if (curPos_[d] == dims_[d] - 1) {
-        curPos_[d] = 0;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]++;
-      }
-    } else {
-      if (curPos_[d] == 0) {
-        curPos_[d] = dims_[d] - 1;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]--;
-      }
-    }
-  }
-
- public:
-  CoordIterator(std::vector<int> dim, std::vector<bool> directions)
-      : dims_(dim), directions_(directions), end_(false) {
-    CHECK_EQ(dims_.size(), directions_.size());
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_.push_back(-1);
-    }
-  }
-  CoordIterator& operator++() {
-    step(dims_.size() - 1, false);
-    return *this;
-  }
-
-  CoordIterator& operator--() {
-    step(dims_.size() - 1, true);
-    return *this;
-  }
-
-  std::vector<int>& curPos() { return curPos_; }
-
-  int offset() {
-    int offset = curPos_[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + curPos_[i];
-    }
-    return offset;
-  }
-
-  int offset(const std::vector<int>& pos) {
-    int offset = pos[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + pos[i];
-    }
-    return offset;
-  }
-
-  std::vector<int>& begin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? 0 : dims_[i] - 1;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  std::vector<int>& rbegin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? dims_[i] - 1 : 0;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  bool end() { return end_; }
-
-  bool getPrePos(const std::vector<int>& delays,
-                 int idx,
-                 std::vector<int>& prePos) {
-    bool isAvial = true;
-    prePos.clear();
-    prePos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1));
-        if (prePos[i] < 0) {
-          prePos[i] = 0;
-          isAvial = false;
-        }
-        if (prePos[i] >= dims_[i]) {
-          prePos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        prePos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-
-  bool getNextPos(const std::vector<int>& delays,
-                  int idx,
-                  std::vector<int>& nextPos) {
-    bool isAvial = true;
-    nextPos.clear();
-    nextPos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1));
-        if (nextPos[i] < 0) {
-          nextPos[i] = 0;
-          isAvial = false;
-        }
-        if (nextPos[i] >= dims_[i]) {
-          nextPos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        nextPos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-};
-/*
- * MDLstmLayer takes 1 input layer with size * (3+numDims).
- * For each sequence [start, end] it performs the following computation:
- * out_i = actState(state_i) * actGate(outputGate_i)
- *
- * For example the image with 2 dims, we take the scanning order from left-top
- * to right-bottom, then the 2 previous states of the current pixels are the
- * ones located at left and top. And each of them has a independent forget gate.
- *
- * state_i = actInput(input_i) * actGate(inputGate_i) +
- *           \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j)
- *
- * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) +
- *             \sum{j}(state_prev_i_j * inputCheck_j)
- *
- * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) +
- *             state_i * outputCheck
- *
- * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j *
- *                recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j)
- *
- * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize
- * */
-
-class MDLstmLayer : public LstmLayer {
- public:
-  explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  void forwardOneSequence(int start, CoordIterator& coordIter);
-  void backwardOneSequence(int start, CoordIterator& coordIter);
-  void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
-  void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
-
- protected:
-  std::vector<Argument> frameInputGate_;
-  std::vector<Argument> frameForgetGate_;
-  std::vector<Argument> frameOutputGate_;
-  std::vector<Argument> frameInputNode_;
-  std::vector<Argument> frameGate_;
-  std::vector<Argument> frameState_;
-  std::vector<Argument> framePreOutput_;
-  std::vector<Argument> frameOutput_;
-
-  // Activation
-  std::unique_ptr<ActivationFunction> activationGate_;
-  std::unique_ptr<ActivationFunction> activationState_;
-
-  int numDims_;
-  size_t numBlocks_;
-  std::vector<bool> directions_;
-  std::vector<int> delays_;
-  std::vector<std::vector<int>> dimsV_;
-};
-
-REGISTER_LAYER(mdlstmemory, MDLstmLayer);
-
-bool MDLstmLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-
-  numBlocks_ = getSize();
-  numDims_ = config_.directions_size();
-  CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize());
-
-  // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_),
-  // peepOg(1), then size of localBias_ is 3+numDims_
-  CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize());
-  weight_.reset(
-      new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_));
-    localBias_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                numBlocks_ * (3 + numDims_),
-                                /* trans= */ false,
-                                useGpu_);
-    checkIg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkFg_ = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkOg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    localBiasGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    numBlocks_ * (3 + numDims_),
-                                    /* trans= */ false,
-                                    useGpu_);
-    checkIgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkFgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ numDims_,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkOgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-
-    localBias_->setData(bias_->getW()->getData());
-    checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_));
-    checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + numDims_));
-    checkOg_->setData(bias_->getW()->getData() +
-                      numBlocks_ * (4 + 2 * numDims_));
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (3 + numDims_));
-      checkFgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + numDims_));
-      checkOgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + 2 * numDims_));
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  for (int i = 0; i < numDims_; i++) {
-    directions_.push_back(config_.directions(i));
-  }
-  for (int i = 0; i < numDims_; i++) {
-    delays_.push_back(-1);
-  }
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-  activationState_.reset(
-      ActivationFunction::create(config_.active_state_type()));
-
-  return true;
-}
-
-void MDLstmLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  int numSequences = input.getNumSequences();
-  resetOutput(batchSize, numBlocks_);
-  CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  int* dimsData = input.cpuSequenceDims->getData();
-  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
-
-  for (int i = 0; i < numSequences; i++) {
-    std::vector<int> dims;
-    for (int j = 0; j < numDims_; j++) {
-      dims.push_back(dimsData[i * numDims_ + j]);
-    }
-    dimsV_.push_back(dims);
-  }
-
-  frameInputGate_.reserve(batchSize);
-  frameForgetGate_.reserve(batchSize);
-  frameOutputGate_.reserve(batchSize);
-  frameInputNode_.reserve(batchSize);
-  frameGate_.reserve(batchSize);
-  frameState_.reserve(batchSize);
-  framePreOutput_.reserve(batchSize);
-  frameOutput_.reserve(batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = frameGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_ * (3 + numDims_),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_ * (3 + numDims_),
-                              /* trans= */ false,
-                              useGpu_);
-    frameGate_.push_back(arg);
-  }
-  for (int i = frameInputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputGate_.push_back(arg);
-  }
-  for (int i = frameForgetGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ numDims_,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameForgetGate_.push_back(arg);
-  }
-  for (int i = frameOutputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutputGate_.push_back(arg);
-  }
-  for (int i = frameInputNode_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputNode_.push_back(arg);
-  }
-  for (int i = frameState_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    frameState_.push_back(arg);
-  }
-  for (int i = framePreOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    framePreOutput_.push_back(arg);
-  }
-  for (int i = frameOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_);
-    frameGate_[i].value->setData(gate_.value->getData() +
-                                 i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 0);
-    frameInputGate_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 1);
-    frameForgetGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * 2);
-    frameOutputGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * (2 + numDims_));
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  gate_.value->assign(*input.value);
-
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  for (int i = 0; i < numSequences; i++) {
-    CoordIterator coordIter(dimsV_[i], directions_);
-    forwardOneSequence(starts[i], coordIter);
-  }
-}
-
-void MDLstmLayer::forwardGate2OutputSequence(int start,
-                                             CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  preOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-  }
-
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      frameInputGate_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim =
-          Matrix::create(checkFg_->getData() + i * numBlocks_,
-                         1.0,
-                         numBlocks_,
-                         false,
-                         useGpu_);
-      fgGateOneDim->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
-    }
-  }
-  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
-  status.check();
-  status = activationGate_->forward(frameForgetGate_[idxCurr]);
-  status.check();
-  status = activation_->forward(frameInputNode_[idxCurr]);
-  status.check();
-
-  frameState_[idxCurr].value->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      frameState_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0);
-    }
-  }
-  frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value,
-                                        *frameInputGate_[idxCurr].value,
-                                        1.0,
-                                        1.0);
-
-  frameOutputGate_[idxCurr].value->addDotMul(
-      *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  status = activationGate_->forward(frameOutputGate_[idxCurr]);
-  status.check();
-
-  framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  status = activationState_->forward(framePreOutput_[idxCurr]);
-  status.check();
-
-  frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
-                                      *frameOutputGate_[idxCurr].value);
-}
-
-void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
-  for (coordIter.begin(); !coordIter.end(); ++coordIter) {
-    int offset = coordIter.offset();
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameGate_[start + offset].value->mul(
-            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
-      }
-    }
-    forwardGate2OutputSequence(start, coordIter);
-  }
-}
-
-void MDLstmLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = 0; i < batchSize; i++) {
-    if (frameState_[i].grad == NULL)
-      frameState_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-  for (int i = 0; i < batchSize; i++) {
-    if (framePreOutput_[i].grad == NULL)
-      framePreOutput_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_);
-    frameGate_[i].grad->setData(gate_.grad->getData() +
-                                i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 0);
-    frameInputGate_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 1);
-    frameForgetGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 2);
-    frameOutputGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * (2 + numDims_));
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    for (size_t i = 0; i < numSequences; i++) {
-      CoordIterator coordIter(dimsV_[i], directions_);
-      backwardOneSequence(starts[i], coordIter);
-    }
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void MDLstmLayer::backwardGate2OutputSequence(int start,
-                                              CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  std::vector<int> nextOffsetV;
-  preOffsetV.reserve(numDims_);
-  nextOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-    std::vector<int> nextPos;
-    if (coordIter.getNextPos(delays_, i, nextPos)) {
-      nextOffsetV[i] = coordIter.offset(nextPos);
-    } else {
-      nextOffsetV[i] = -1;
-    }
-  }
-
-  framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                        *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]).check();
-  frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
-
-  frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                         *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]).check();
-
-  frameState_[idxCurr].grad->addDotMul(
-      *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
-  for (int i = 0; i < numDims_; i++) {
-    if (nextOffsetV[i] >= 0) {
-      frameState_[idxCurr].grad->addDotMul(
-          *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].grad->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr fgGateOneDimVal = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].value->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim = Matrix::create(
-          checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_);
-
-      frameState_[idxCurr].grad->addDotMul(
-          *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0);
-      frameState_[idxCurr].grad->addDotMul(
-          *frameState_[start + nextOffsetV[i]].grad,
-          *fgGateOneDimVal,
-          1.0,
-          1.0);
-    }
-  }
-
-  frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputGate_[idxCurr].value);
-  frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputNode_[idxCurr].value);
-
-  frameForgetGate_[idxCurr].grad->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad,
-                                  *frameState_[start + preOffsetV[i]].value,
-                                  1.0,
-                                  1.0);
-    }
-  }
-
-  activationGate_->backward(frameInputGate_[idxCurr]).check();
-  activationGate_->backward(frameForgetGate_[idxCurr]).check();
-  activation_->backward(frameInputNode_[idxCurr]).check();
-
-  if (bias_->getWGrad()) {
-    for (int i = 0; i < numDims_; i++) {
-      if (preOffsetV[i] >= 0) {
-        checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad,
-                                *frameState_[start + preOffsetV[i]].value,
-                                1.0,
-                                1.0);
-
-        MatrixPtr fgGateOneDimGrad = Matrix::create(
-            frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-            1,
-            numBlocks_,
-            false,
-            useGpu_);
-        MatrixPtr checkFgOneDimGrad =
-            Matrix::create(checkFgGrad_->getData() + i * numBlocks_,
-                           1,
-                           numBlocks_,
-                           false,
-                           useGpu_);
-        checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad,
-                                     *frameState_[start + preOffsetV[i]].value,
-                                     1.0,
-                                     1.0);
-      }
-    }
-    checkOgGrad_->addDotMul(
-        *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0);
-  }
-}
-
-void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  for (coordIter.rbegin(); !coordIter.end(); --coordIter) {
-    int offset = coordIter.offset();
-    backwardGate2OutputSequence(start, coordIter);
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameOutput_[start + preOffset].grad->mul(
-            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
-        if (weight_->getWGrad()) {
-          weight_->getWGrad()->mul(
-              *frameOutput_[start + preOffset].value->getTranspose(),
-              *frameGate_[start + offset].grad,
-              1.0,
-              1.0);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
deleted file mode 100644
index 544b4082f..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNAddtoLayer.h"
-
-using namespace mkldnn;  // NOLINT
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
-
-bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  layerSize_ = getSize();
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    CHECK_EQ(layerSize_, inputLayers_[i]->getSize()) << "input size must equal";
-  }
-  if (biasParameter_.get() != NULL) {
-    biases_ =
-        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
-  reshapeInput(bs, ih, iw);
-  ic = inputLayers_[0]->getSize() / ih / iw;
-  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
-           (size_t)bs * ic * ih * iw);
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
-    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
-  }
-
-  oc = ic;
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs, biasVal_, out);
-
-  std::shared_ptr<sum::primitive_desc> fwdPD;
-  std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
-}
-
-void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                std::vector<MKLDNNMatrixPtr>& inputs,
-                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inputs, biasGrad_, out);
-
-  // backward only need share output grad to input grad
-  for (size_t i = 0; i < inputs.size(); i++) {
-    if (inputs[i] != nullptr) {
-      inputs[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
-    }
-  }
-
-  // backward bias
-  bwdBias_ = nullptr;
-  if (biasGrad_) {
-    std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_,
-                                               biasGrad_->getPrimitiveDesc());
-    auto biasPD =
-        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
-    std::vector<primitive::at> srcs;
-    for (size_t i = 0; i < grads_.size(); ++i) {
-      srcs.push_back(*(grads_[i]));
-    }
-    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
-    pipeline.push_back(*bwdBias_);
-  }
-}
-
-void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
-                                   const MatrixPtr& biasMat,
-                                   const MKLDNNMatrixPtr& out,
-                                   std::vector<MKLDNNMatrixPtr>& outs) {
-  auto pd = MKLDNNMatrix::createPrimitiveDesc(
-      {(int)layerSize_}, memory::format::x, engine_);
-  bias = MKLDNNMatrix::create(pd, biasMat);
-  outs.clear();
-  real* data = out->getData();
-  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
-  for (int i = 0; i < bs_; ++i) {
-    MatrixPtr tmp =
-        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
-    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                       MKLDNNMatrixPtr& bias,
-                                       MKLDNNMatrixPtr& out) {
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInValue(inputs[i], nullptr, i);
-    CHECK(inputs[i]);
-    inputs[i]->downSpatial();
-  }
-  for (size_t i = 1; i < inputs.size(); i++) {
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
-  }
-
-  resetOutValue(out, inputs[0]->getPrimitiveDesc());
-
-  if (biases_ && biases_->getW()) {
-    prepareBias(bias, biases_->getW(), out, vals_);
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
-                                  std::shared_ptr<sum::primitive_desc>& biasPD,
-                                  std::vector<MKLDNNMatrixPtr>& inputs,
-                                  MKLDNNMatrixPtr bias,
-                                  MKLDNNMatrixPtr out) {
-  std::vector<float> scales(inputs.size(), 1.0);
-  std::vector<memory::primitive_desc> srcPDs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
-  }
-  CHECK(out);
-  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-
-  biasPD = nullptr;
-  if (bias) {
-    std::vector<float> scales(2, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
-    biasPD.reset(
-        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
-    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
-  }
-}
-
-void MKLDNNAddtoLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<sum::primitive_desc>& pd,
-    std::shared_ptr<sum::primitive_desc>& biasPD,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  std::vector<primitive::at> srcs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcs.push_back(*(inputs[i]));
-  }
-  fwd_.reset(new sum(*pd, srcs, *out));
-  pipeline.push_back(*fwd_);
-
-  fwdBias_.clear();
-  if (biasPD == nullptr || bias == nullptr) {
-    return;
-  }
-  fwdBias_.resize(vals_.size());
-  for (size_t i = 0; i < vals_.size(); ++i) {
-    std::vector<primitive::at> srcs;
-    srcs.push_back(*(vals_[i]));
-    srcs.push_back(*bias);
-    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
-    pipeline.push_back(*fwdBias_[i]);
-  }
-}
-
-void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                       MKLDNNMatrixPtr& bias,
-                                       MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  CHECK(out);
-
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    prepareBias(bias, biases_->getWGrad(), out, grads_);
-  } else {
-    bias = nullptr;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
deleted file mode 100644
index 0b385e804..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Addto layer.
- *
- * The config file api is mkldnn_addto
- */
-class MKLDNNAddtoLayer : public MKLDNNLayer {
- protected:
-  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
-  size_t layerSize_;
-
-  std::unique_ptr<Weight> biases_;
-
-  // buffers for adding bias
-  std::vector<MKLDNNMatrixPtr> vals_;
-  std::vector<MKLDNNMatrixPtr> grads_;
-  // primitives for adding bias
-  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
-  std::shared_ptr<mkldnn::primitive> bwdBias_;
-
- public:
-  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNAddtoLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
- protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-
-  void prepareBias(MKLDNNMatrixPtr& bias,
-                   const MatrixPtr& biasMat,
-                   const MKLDNNMatrixPtr& out,
-                   std::vector<MKLDNNMatrixPtr>& outs);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBase.h b/paddle/legacy/gserver/layers/MKLDNNBase.h
deleted file mode 100644
index 786ceaf86..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBase.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-typedef enum {
-  MKLDNN_BASE = 1,   // basical info of MKLDNN
-  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_FMTS = 2,   // format info of MKLDNN
-  MKLDNN_SIZES = 3,  // size info of MKLDNN
-  MKLDNN_ALL = 4,    // show all info of MKLDNN
-} MKLDNN_LOG_LEVEL;
-
-/**
- * @brief MKLDNN CPU engine.
- *
- */
-class CPUEngine {
- public:
-  static CPUEngine& Instance() {
-    // Thread-safe in C++11.
-    static CPUEngine myInstance;
-    return myInstance;
-  }
-
-  // Disallow copy or move
-  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
-  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
-  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
-  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
-
-  mkldnn::engine& getEngine() { return cpuEngine_; }
-
- protected:
-  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
-  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
-  ~CPUEngine() {}
-
- private:
-  mkldnn::engine cpuEngine_;
-};
-
-/**
- * @brief MKLDNN Stream.
- *
- */
-class MKLDNNStream {
- public:
-  MKLDNNStream() : ready_(false) { resetState(); }
-
-  virtual ~MKLDNNStream() {}
-
-  /**
-   * @brief Submit stream
-   * @param prims The primitives vector
-   * @param block Waiting for the stream to complete
-   */
-  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
-    resetState();
-    stream_->submit(prims).wait(block);
-    ready_ = false;
-  }
-
-  /**
-   * @brief Reset the mkldnn stream
-   */
-  void resetState() {
-    if (ready_) {
-      return;
-    }
-    // TODO(TJ): change me when mkldnn have method to reset this state
-    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
-    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-    ready_ = true;
-  }
-
- private:
-  bool ready_;
-  std::shared_ptr<mkldnn::stream> stream_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
deleted file mode 100644
index dbdfaff32..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNBatchNormLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
-
-bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  // first one is input layer
-  // the other two are created in config_parser.py saving moving mean and var
-  CHECK_EQ(inputLayers_.size(), 3U);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
-
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  ic_ = conf.channels();
-  ih_ = inputLayers_[0]->getOutput().getFrameHeight();
-  iw_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (iw_ == 0 && ih_ == 0) {
-    iw_ = conf.img_size();
-    ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  oc_ = ic_;
-  oh_ = ih_;
-  ow_ = iw_;
-  if (config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-  movingAvgFraction_ = config_.moving_average_fraction();
-  epsilon_ = config_.epsilon();
-
-  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
-                    << " --- global stats";
-  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
-
-  initWeight();
-  movingMean_.reset(new Weight(oc_, 1, parameters_[1], 0));
-  movingVar_.reset(new Weight(oc_, 1, parameters_[2], 0));
-  return true;
-}
-
-void MKLDNNBatchNormLayer::initWeight() {
-  weight_.reset(new Weight(1, oc_, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
-  }
-  CHECK_EQ(weight_ != nullptr, biases_ != nullptr)
-      << "only support have both weight and bias, or neither";
-  if (weight_ && weight_->getW()) {
-    CHECK(biases_ && biases_->getW());
-    valueScaleShift_ = Matrix::create(2, oc_, false, false);
-    valueScaleShift_->zeroMem();
-    VectorPtr scale(new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), 0));
-    VectorPtr shift(
-        new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), oc_));
-    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_VALUE);
-    scale->copyFrom(*wgt);
-    shift->copyFrom(*bias);
-    wgt->setData(valueScaleShift_->getData());
-    bias->setData(valueScaleShift_->getData() + oc_);
-  }
-  if (weight_ && weight_->getWGrad()) {
-    CHECK(biases_ && biases_->getWGrad());
-    gradScaleShift_ = Matrix::create(2, oc_, false, false);
-    gradScaleShift_->zeroMem();
-    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_GRADIENT);
-    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_GRADIENT);
-    wgt->setData(gradScaleShift_->getData());
-    bias->setData(gradScaleShift_->getData() + oc_);
-  }
-}
-
-void MKLDNNBatchNormLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-  // prepare mean and var if necessary
-  if (useGlobalStats_) {
-    CHECK(mean_);
-    CHECK(var_);
-    mean_->copyFrom(*(movingMean_->getW()));
-    var_->copyFrom(*(movingVar_->getW()));
-  }
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
-  // calculating and saving moving mean and variance
-  CHECK_EQ(useGlobalStats_, false);
-  movingMean_->getW()->add(
-      *mean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  // here var is v^2
-  movingVar_->getW()->add(*var_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-}
-
-void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  oh = ih;
-  ow = iw;
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& out) {
-  // In training phase, it will always calculate mean and var,
-  // so useGlobalStats must be false.
-  // In scoring phase, it depends on useGlobalStats choice.
-  if (passType_ != PASS_TEST && useGlobalStats_ == true) {
-    LOG(WARNING) << "use_global_stats is invalid setting in training phase";
-    useGlobalStats_ = false;
-  }
-
-  resetFwdBuffers(inputs[0], wgtVal_, out);
-
-  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
-}
-
-void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    std::vector<MKLDNNMatrixPtr>& inputs,
-                                    MKLDNNMatrixPtr& out) {
-  std::shared_ptr<bn_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], wgtGrad_, out);
-
-  resetBwdPD(pd, inputs[0], wgtGrad_, out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
-}
-
-void MKLDNNBatchNormLayer::forward(PassType passType) {
-  MKLDNNLayer::forward(passType);
-
-  // calculate and save moving mean and variance
-  if (passType_ != PASS_TEST) {
-    calMovingMeanAndVar();
-  }
-}
-
-void MKLDNNBatchNormLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                           MKLDNNMatrixPtr& wgt,
-                                           MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-
-  if (valueScaleShift_) {
-    auto pd = MKLDNNMatrix::createPrimitiveDesc({2, oc_}, format::nc, engine_);
-    resetWithMatrix(wgt, valueScaleShift_, pd);
-  }
-  if (passType_ != PASS_TEST || useGlobalStats_) {
-    auto pd = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
-    mean_ = MKLDNNMatrix::create(pd);
-    var_ = MKLDNNMatrix::create(pd);
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdPD(
-    std::shared_ptr<bn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr in,
-    MKLDNNMatrixPtr wgt,
-    MKLDNNMatrixPtr out) {
-  flags_ = 0u;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  if (useGlobalStats_) {
-    flags_ = (flags_ | batch_normalization_flag::use_global_stats);
-  }
-  if (wgt) {
-    flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
-  }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
-  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-  if (wgt) {
-    CHECK_PRIMITIVE_DESC_EQ(wgt, pd->weights_primitive_desc());
-  }
-  if (passType_ != PASS_TEST || useGlobalStats_) {
-    CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
-    CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
-  }
-}
-
-void MKLDNNBatchNormLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<bn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  if (passType_ == PASS_TEST) {
-    if (useGlobalStats_) {
-      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd,
-                                             *in,
-                                             (const primitive::at)(*mean_),
-                                             (const primitive::at)(*var_),
-                                             *wgt,
-                                             *out)
-                                : new bn_fwd(*pd,
-                                             *in,
-                                             (const primitive::at)(*mean_),
-                                             (const primitive::at)(*var_),
-                                             *out));
-    } else {
-      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out)
-                                : new bn_fwd(*pd, *in, *out));
-    }
-  } else {
-    CHECK_EQ(useGlobalStats_, false)
-        << "useGlobalStats should be false in training";
-    fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out, *mean_, *var_)
-                              : new bn_fwd(*pd, *in, *out, *mean_, *var_));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                           MKLDNNMatrixPtr& wgt,
-                                           MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-  if (gradScaleShift_) {
-    CHECK(wgtVal_);
-    resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNBatchNormLayer::resetBwdPD(
-    std::shared_ptr<bn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
-  auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
-  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
-}
-
-void MKLDNNBatchNormLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<bn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-  CHECK(inVals_[0]);
-  bwdData_.reset(
-      wgt && wgtVal_
-          ? new bn_bwd(
-                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
deleted file mode 100644
index 9aa20df98..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::batch_normalization_forward bn_fwd;
-typedef mkldnn::batch_normalization_backward bn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer BatchNorm layer.
- *
- * The config file api is mkldnn_batch_norm
- */
-class MKLDNNBatchNormLayer : public MKLDNNLayer {
- protected:
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
-
-  // Epsilon value used in the batch normalization formula.
-  real epsilon_;
-
-  // weight and bias in paddle
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-  // mkldnn use a large buffer store both scale and shift
-  // which are weight and bias in paddle corresponding.
-  MatrixPtr valueScaleShift_;
-  MatrixPtr gradScaleShift_;
-  // Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  // Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in every mini-batch.
-  bool useGlobalStats_;
-  // used in MKLDNN primitive desc
-  unsigned flags_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // local mean and variance
-  // when useGlobalStats_ they are loaded from moving mean and variance
-  // when do not useGlobalStats_ they are calculated from this mini-batch
-  MKLDNNMatrixPtr mean_;
-  MKLDNNMatrixPtr var_;
-
- public:
-  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
-
-  ~MKLDNNBatchNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
- protected:
-  void initWeight();
-  /**
-   * cal moving mean and variance.
-   * moving = moving * AvgFraction + local * (1 - AvgFraction)
-   */
-  void calMovingMeanAndVar();
-
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& wgt,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
deleted file mode 100644
index beed6176e..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNConcatLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
-
-bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  CHECK_GT(inputLayers_.size(), 1UL);
-  CHECK(!biasParameter_);
-  return true;
-}
-
-void MKLDNNConcatLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  ic = inputLayers_[0]->getSize() / ih / iw;
-  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
-           (size_t)bs * ic * ih * iw);
-  CHECK_GT(inputLayers_.size(), 1UL);
-  channels_.resize(inputLayers_.size());
-  channels_[0] = ic;
-  oc = ic;
-  for (size_t i = 1; i < inputLayers_.size(); i++) {
-    int batchsize = 0, height = 0, witdh = 0;
-    reshapeInput(batchsize, height, witdh, i);
-    CHECK_EQ(bs, batchsize);
-    CHECK_EQ(ih, height);
-    CHECK_EQ(iw, witdh);
-
-    channels_[i] = inputLayers_[i]->getSize() / height / witdh;
-    CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
-    oc += channels_[i];
-  }
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
-                                 std::vector<MKLDNNMatrixPtr>& inputs,
-                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs, out);
-
-  std::shared_ptr<concat::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inputs, out);
-
-  resetFwdPipeline(pipeline, fwdPD, inputs, out);
-}
-
-void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
-                                 std::vector<MKLDNNMatrixPtr>& inputs,
-                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inputs, out);
-
-  resetBwdPipeline(pipeline, bwds_, inputs, out);
-}
-
-void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                        MKLDNNMatrixPtr& out) {
-  inputs.resize(inputLayers_.size());
-  bool has8c = false, has16c = false, hasnc = false;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    resetInValue(inputs[i], nullptr, i, channels_[i]);
-    inputs[i]->downSpatial();
-    CHECK(inputs[i]);
-    auto dm = inputs[i]->getDims();
-    // inputs format can be different, but ndims must equal
-    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
-    CHECK_EQ(bs_, dm[0]);
-    CHECK_EQ(channels_[i], dm[1]);
-    if (dm.size() > 2) {
-      CHECK_EQ(ih_, dm[2]);
-      CHECK_EQ(iw_, dm[3]);
-    }
-    if (inputs[i]->getFormat() == format::nc) {
-      hasnc = true;
-    }
-    if (inputs[i]->getFormat() == format::nChw8c) {
-      has8c = true;
-    }
-    if (inputs[i]->getFormat() == format::nChw16c) {
-      has16c = true;
-    }
-  }
-
-  format outFmt;
-  if (has16c && oc_ % 16 == 0) {
-    outFmt = format::nChw16c;
-  } else if (has8c && oc_ % 8 == 0) {
-    outFmt = format::nChw8c;
-  } else if (hasnc) {
-    CHECK(oh_ == 1 && ow_ == 1);
-    outFmt = format::nc;
-  } else {
-    outFmt = format::nchw;
-  }
-  memory::dims outDims =
-      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
-  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
-                                   std::vector<MKLDNNMatrixPtr>& inputs,
-                                   MKLDNNMatrixPtr out) {
-  std::vector<memory::primitive_desc> srcPDs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
-  }
-  CHECK(out);
-  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
-  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
-}
-
-void MKLDNNConcatLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<concat::primitive_desc>& pd,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& out) {
-  std::vector<primitive::at> srcs;
-  for (size_t i = 0; i < inputs.size(); i++) {
-    srcs.push_back(*(inputs[i]));
-  }
-  fwd_.reset(new concat(*pd, srcs, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                                        MKLDNNMatrixPtr& out) {
-  CHECK(outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  CHECK(out);
-
-  inputs.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    CHECK(inVals_[i]);
-    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
-    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
-  }
-}
-
-void MKLDNNConcatLayer::resetBwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
-    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
-    std::vector<MKLDNNMatrixPtr>& inputs,
-    MKLDNNMatrixPtr& out) {
-  // reset the backward primitives
-  memory::dims offsets = {0, 0, 0, 0};
-  prims.resize(inputs.size());
-  CHECK_EQ(inputs.size(), channels_.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    auto viewPD = view::primitive_desc(
-        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
-    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
-                                         inputs[i]->getPrimitiveDesc());
-    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
-    offsets[axis_] += channels_[i];
-    // push to pipeline
-    pipeline.push_back(*prims[i]);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
deleted file mode 100644
index d7738df6c..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Concatenate layer.
- *
- * The config file api is mkldnn_concat
- */
-class MKLDNNConcatLayer : public MKLDNNLayer {
- protected:
-  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
-  // input channel numbers
-  std::vector<int> channels_;
-
-  // concat_dimension in MKLDNN
-  // if axis_ == 0, concat batchsize
-  // if axis_ == 1, concat channel (default)
-  int axis_;
-
- public:
-  explicit MKLDNNConcatLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), axis_(1) {}
-
-  ~MKLDNNConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    CHECK_EQ(channels_.size(), inputLayers_.size());
-    for (size_t i = 0; i < channels_.size(); ++i) {
-      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
-                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
-                         << ", " << iw_;
-    }
-    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
-                       << ", " << ow_;
-  }
-
-  size_t keepCondition() {
-    // reset when the total element size of all inputs changed
-    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
-    for (size_t i = 1; i < inputLayers_.size(); ++i) {
-      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
-    }
-    return totalSize;
-  }
-
- protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
deleted file mode 100644
index b47bf1482..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNConvLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
-
-bool MKLDNNConvLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(config_.shared_biases()) << "Only support shared biases yet";
-
-  oc_ = config_.num_filters();
-  const ConvConfig& conf = config_.inputs(0).conv_conf();
-  ic_ = conf.channels();
-  fw_ = conf.filter_size();
-  fh_ = conf.filter_size_y();
-  pw_ = conf.padding();
-  ph_ = conf.padding_y();
-  dw_ = conf.dilation();
-  dh_ = conf.dilation_y();
-  sw_ = conf.stride();
-  sh_ = conf.stride_y();
-  gp_ = conf.groups();
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  caffeMode_ = conf.caffe_mode();
-  CHECK(caffeMode_) << "Only support caffe mode yet";
-  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
-  // check group setting
-  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
-  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
-
-  // create weight
-  size_t height = oc_ / gp_;
-  size_t width = ic_ * fh_ * fw_;
-  CHECK_EQ(parameters_[0]->getSize(), height * width);
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNConvLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  // the paddle weight format is oihw or goihw
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNConvLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  // cal output sizes
-  // oc can not be changed
-  int fh = (fh_ - 1) * dh_ + 1;
-  int fw = (fw_ - 1) * dw_ + 1;
-  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
-  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdPD(fwdPD_);
-
-  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdWgtPD(bwdWgtPD);
-
-  resetBwdDataPD(bwdDataPD);
-
-  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
-                                       memory::dims& bias,
-                                       memory::dims& stride,
-                                       memory::dims& dilation,
-                                       memory::dims& padL,
-                                       memory::dims& padR) {
-  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
-                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
-  bias = memory::dims{oc_};
-  stride = memory::dims{sh_, sw_};
-  padL = memory::dims{ph_, pw_};
-  padR = getPaddingR();
-  // note: mkldnn dilation start from 0
-  dilation = memory::dims{dh_ - 1, dw_ - 1};
-}
-
-void MKLDNNConvLayer::resetFwdPD(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
-  // dims for conv
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  conv_fwd::desc fwdDesc =
-      biases_ && biases_->getW()
-          ? conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(biasDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind)
-          : conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind);
-  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNConvLayer::resetFwdBuffers(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(pd);
-  resetInValue(
-      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
-
-  resetOutValue(out, pd->dst_primitive_desc());
-
-  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
-
-  if (biases_ && biases_->getW()) {
-    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNConvLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNConvLayer::resetBwdWgtPD(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  // create backward weight using input, output and weight value memory desc
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  CHECK(wgtVal_) << "Should have weight value";
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  auto bwdWgtDesc = biasVal_ != nullptr
-                        ? conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            biasVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind)
-                        : conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind);
-  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      pd->diff_weights_primitive_desc(),
-      "primitive desc of weight value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdDataPD(
-    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
-  pd = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  // create backward data using input and output value memory desc
-  // but using weight memory desc with any format
-  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVals_[0]->getMemoryDesc(),
-                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
-                                        outVal_->getMemoryDesc(),
-                                        strides,
-                                        padL,
-                                        padR,
-                                        padding_kind::zero);
-  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(
-      inVals_[0],
-      pd->diff_src_primitive_desc(),
-      "primitive desc of in value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdBuffers(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(wgtPD);
-  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
-
-  resetWithMatrix(
-      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      wgt->getPrimitiveDesc(),
-      "primitive desc of weight grad and value should be equal");
-
-  bias = nullptr;
-  if (biases_ && biases_->getWGrad()) {
-    resetWithMatrix(
-        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
-    CHECK(bias);
-    CHECK_PRIMITIVE_DESC_EQ(
-        biasVal_,
-        bias->getPrimitiveDesc(),
-        "primitive desc of bias grad and value should be equal");
-  }
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  resetInGrad(in, dataPD->diff_src_primitive_desc());
-  resetWgtValBwdData(dataPD, wgtValBwdData_);
-}
-
-void MKLDNNConvLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  // add bwdWgt handle
-  if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  if (cvtWgtVal_) {
-    pipeline.push_back(*cvtWgtVal_);
-  }
-  // add bwdData handle
-  CHECK(wgtValBwdData_) << "Should have weight memory";
-  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-void MKLDNNConvLayer::resetWgtValBwdData(
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& wgt) {
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  // create new weight value for backward data, and create reorder if necessary
-  // since the primitive_desc would be different with wgtVal_
-  CHECK(wgtVal_) << "should have weight value";
-  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
-    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
-    CHECK(cvtWgtVal_);
-  } else {
-    wgtValBwdData_ = wgtVal_;
-  }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
-                    << wgtValBwdData_->getFormat();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.h b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
deleted file mode 100644
index d399035ed..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::convolution_forward conv_fwd;
-typedef mkldnn::convolution_backward_weights conv_bwdWgt;
-typedef mkldnn::convolution_backward_data conv_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer conv layer.
- *
- * The config file api is mkldnn_conv
- */
-class MKLDNNConvLayer : public MKLDNNLayer {
- protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // dilation height and width
-  int dh_, dw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-  // group number
-  int gp_;
-
-  // in resetBwdData, the format of wgtValBwdData_ is different with wgtVal_
-  MKLDNNMatrixPtr wgtValBwdData_;
-  // convert handle from wgtVal_ to wgtValBwdData_
-  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
-
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // true by default, which impact the calculation of output image size.
-  // details can refer to mathUtil.h
-  bool caffeMode_;
-
-  // weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit MKLDNNConvLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
-
-  ~MKLDNNConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
-  }
-
- protected:
-  /**
-   * load the dims settings of this conv
-   */
-  void loadConvSettings(mkldnn::memory::dims& wgt,
-                        mkldnn::memory::dims& bias,
-                        mkldnn::memory::dims& stride,
-                        mkldnn::memory::dims& dilation,
-                        mkldnn::memory::dims& padL,
-                        mkldnn::memory::dims& padR);
-
-  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * reset MKLDNNMatrix of weight value for backward data
-   * since the primitive_desc would be different with wgtVal_
-   */
-  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                          MKLDNNMatrixPtr& wgt);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_convolution_forward_common.hpp
-   * @note: mkldnn dilation start from 0 while paddle start from 1
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
-        ++padR[0];
-      }
-      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
deleted file mode 100644
index f3747c7db..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNFcLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
-
-bool MKLDNNFcLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
-
-  // output size, cat not be changed
-  oc_ = getSize();
-  oh_ = 1;
-  ow_ = 1;
-  ih_ = 1;
-  iw_ = 1;
-
-  // input size can not change in FC
-  iLayerSize_ = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
-
-  // create weight
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNFcLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNFcLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
-  ic = iLayerSize_ / (ih * iw);
-  CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible";
-  CHECK_EQ(size_t(oc), getSize());
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc);
-}
-
-void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
-
-  resetBwdDataPD(bwdDataPD, inputs[0], out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  in->downSpatial();
-
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
-  resetOutValue(out, outPD);
-
-  format wgtFmt = format::oihw;
-  if (in->getFormat() == format::nChw8c) {
-    wgtFmt = format::oIhw8i;
-  } else if (in->getFormat() == format::nChw16c) {
-    wgtFmt = format::oIhw16i;
-  }
-  auto wgtPD =
-      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
-  resetWithMatrix(wgt, weight_->getW(), wgtPD);
-  wgt->downSpatial();
-
-  if (biases_ && biases_->getW()) {
-    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
-    resetWithMatrix(bias, biases_->getW(), biasPD);
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                               MKLDNNMatrixPtr in,
-                               MKLDNNMatrixPtr wgt,
-                               MKLDNNMatrixPtr bias,
-                               MKLDNNMatrixPtr out) {
-  CHECK(in);
-  CHECK(wgt);
-  CHECK(out);
-  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        bias->getMemoryDesc(),
-                                                        out->getMemoryDesc())
-                                         : fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        out->getMemoryDesc());
-  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNFcLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-
-  CHECK(wgtVal_);
-  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-
-  if (biasVal_) {
-    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetBwdWgtPD(
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  fc_bwdWgt::desc bwdWgtDesc =
-      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             bias->getMemoryDesc(),
-                             out->getMemoryDesc())
-           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             out->getMemoryDesc());
-  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdDataPD(
-    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_);
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
-      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
-  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (bwdDataPD == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_) << "Should have weight memory";
-  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.h b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
deleted file mode 100644
index a704066cc..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::inner_product_forward fc_fwd;
-typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
-typedef mkldnn::inner_product_backward_data fc_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer fc layer.
- *
- * The config file api is mkldnn_fc
- */
-class MKLDNNFcLayer : public MKLDNNLayer {
- protected:
-  // input layer size, can not be change after init
-  size_t iLayerSize_;  // == ic * ih * iw
-
-  // if has already init the weight
-  bool hasInitedWgt_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
-
-  // fc weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false) {}
-
-  ~MKLDNNFcLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& wgt,
-                     MKLDNNMatrixPtr& bias,
-                     MKLDNNMatrixPtr& out);
-  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-                      MKLDNNMatrixPtr& in,
-                      MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
deleted file mode 100644
index 739482348..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNLRNLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
-
-bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  localSize_ = conf.size();
-  alpha_ = conf.scale();
-  beta_ = conf.pow();
-
-  ic_ = conf.channels();
-  oc_ = ic_;
-  iw_ = conf.img_size();
-  ow_ = conf.output_x();
-  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  CHECK_EQ(iw_, ow_);
-  CHECK_EQ(ih_, oh_);
-  return true;
-}
-
-void MKLDNNLRNLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  std::shared_ptr<lrn_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  resetOutValue(out, in->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr in,
-                                MKLDNNMatrixPtr out) {
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = lrn_fwd::desc(pk,
-                               algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
-  // prepare workspace if necessary
-  workspace_ =
-      passType_ != PASS_TEST
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNLRNLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
-             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(out);
-  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               out->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNLRNLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-  CHECK(inVals_[0]);
-  CHECK(workspace_);
-  bwdData_ = std::make_shared<lrn_bwd>(
-      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
deleted file mode 100644
index 028438f2c..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::lrn_forward lrn_fwd;
-typedef mkldnn::lrn_backward lrn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
- *
- * The config file api is mkldnn_lrn
- */
-class MKLDNNLRNLayer : public MKLDNNLayer {
- protected:
-  // save forward primitive_desc, which can be used in backward
-  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_lrn_backward.cpp, lrn need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
-  int localSize_;
-  float alpha_, beta_;  // scale and pow in paddle
-
- public:
-  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNLRNLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
deleted file mode 100644
index f0acffe87..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNLayer.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-bool MKLDNNLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
-                          << "Please set WITH_MKL=ON "
-                          << "and set use_mkldnn=True";
-  CHECK(!useGpu_) << "Do not support GPU yet";
-
-  // set device id before Layer::init
-  setDevice(MKLDNN_DEVICE);
-  // change param device to MKLDNN device
-  setParamsDevice(MKLDNN_DEVICE, parameterMap);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setOutputMap();
-  checkCPUOutputsNumber();
-
-  stream_.reset(new MKLDNNStream());
-  engine_ = CPUEngine::Instance().getEngine();
-  return true;
-}
-
-void MKLDNNLayer::forward(PassType passType) {
-  passType_ = passType;
-
-  {
-    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    CHECK(!inputLayers_.empty());
-    copySeqInfoToOutputs();
-    if (condition_ != keepCondition()) {
-      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-      condition_ = keepCondition();
-      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-      printSizeInfo();
-      // the output_.value and output_.grad are shared with CPU device
-      shareCPUDevice();
-      pipelineFwd_.clear();
-      inVals_.resize(inputLayers_.size(), nullptr);
-      extInVals_.resize(inputLayers_.size(), nullptr);
-      cvtInVals_.resize(inputLayers_.size(), nullptr);
-      resetFwd(pipelineFwd_, inVals_, outVal_);
-      prepareValueConversions(pipelineFwd_);
-      convertWeightsFromPaddle();
-      printValueFormat();
-      needResetBwd_ = true;
-    }
-
-    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
-      // Update input value data when input layer is "data" type,
-      // since the input value data address might be changed.
-      CHECK(extInVals_[0]);
-      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
-    }
-
-    if (!outputOnlyMKLDNN_) {
-      clearGrads();
-    }
-    stream_->submit(pipelineFwd_);
-  }
-  {
-    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MKLDNNLayer::backward(const UpdateCallback& callback) {
-  if (needResetBwd_) {
-    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-    pipelineBwd_.clear();
-    inGrads_.resize(inputLayers_.size(), nullptr);
-    extInGrads_.resize(inputLayers_.size(), nullptr);
-    cvtInGrads_.resize(inputLayers_.size(), nullptr);
-    pipelineMergeGrad_.clear();
-    mergeGrad_ = nullptr;
-    resetBwd(pipelineBwd_, inGrads_, outGrad_);
-    prepareGradConversions(pipelineBwd_);
-    printGradFormat();
-    needResetBwd_ = false;
-  }
-
-  // merge grad must before backward activation
-  if (mergeGrad_) {
-    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
-    stream_->submit(pipelineMergeGrad_);
-  }
-  {
-    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
-    backwardActivation();
-  }
-  {
-    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-    stream_->submit(pipelineBwd_);
-  }
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    updateWeights(callback);
-  }
-}
-
-void MKLDNNLayer::reshapeInput(int& batchsize,
-                               int& height,
-                               int& width,
-                               size_t idx) {
-  const Argument& input = inputLayers_[idx]->getOutput();
-  batchsize = input.getBatchSize();
-  int h = input.getFrameHeight();
-  int w = input.getFrameWidth();
-  if (h != 0) {
-    height = h;
-  }
-  if (w != 0) {
-    width = w;
-  }
-  height = height != 0 ? height : 1;
-  width = width != 0 ? width : 1;
-}
-
-void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
-  output_.setFrameHeight(height);
-  output_.setFrameWidth(width);
-  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-    outputOtherDevice_[i].setFrameHeight(height);
-    outputOtherDevice_[i].setFrameWidth(width);
-  }
-}
-
-void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                                  const MatrixPtr& mat,
-                                  memory::primitive_desc pd) {
-  dnn = nullptr;
-  if (mat == nullptr) {
-    return;
-  }
-  dnn = MKLDNNMatrix::create(pd, mat);
-}
-
-void MKLDNNLayer::resetInValue(
-    MKLDNNMatrixPtr& in,
-    const std::shared_ptr<memory::primitive_desc>& intPD,
-    size_t idx,
-    int inputChannel) {
-  cvtInVals_[idx] = nullptr;
-  extInVals_[idx] = nullptr;
-  in = nullptr;
-  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
-  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
-  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
-  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
-  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
-  if (extInVals_[idx] == nullptr ||
-      extInVals_[idx]->getFormat() == format::nc) {
-    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
-  }
-  in = extInVals_[idx];
-  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
-    return;
-  }
-  // need create reorder
-  in = MKLDNNMatrix::create(*intPD);
-  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
-  CHECK(cvtInVals_[idx]) << "should not be emptry";
-}
-
-void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
-                                memory::primitive_desc intPD) {
-  cvtOutVal_ = nullptr;
-  out = MKLDNNMatrix::create(intPD, output_.value);
-  extOutVal_ = out;
-  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
-  extOutVal_ = MKLDNNMatrix::create(
-      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
-  out = MKLDNNMatrix::create(intPD);
-  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
-  CHECK(cvtOutVal_) << "should not be empty";
-}
-
-void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
-                              memory::primitive_desc intPD,
-                              size_t idx) {
-  cvtInGrads_[idx] = nullptr;
-  extInGrads_[idx] = nullptr;
-  in = nullptr;
-  LayerPtr& input = inputLayers_[idx];
-  if (input->getOutputGrad() == nullptr) {
-    // no need input grad
-    return;
-  }
-  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
-      << "only support input is MKLDNN layer or only have one output layer";
-  // when input is a mkldnn branch node,
-  // this layer will save input grad to a internal buffer,
-  // and the mkldnn input layer will merge them to actual prev->output_.grad
-  const MatrixPtr& inMat =
-      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
-  in = MKLDNNMatrix::create(intPD, inMat);
-  Argument& arg = input->getOutput(this->getName());
-  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
-  if (inputIsOnlyMKLDNN()) {
-    return;
-  }
-
-  extInGrads_[idx] = in;
-  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK(extInVals_[idx] != nullptr &&
-        isPaddleFormat(extInVals_[idx]->getFormat()))
-      << "should have external input value and the format must be nchw(nc)";
-  extInGrads_[idx] =
-      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
-  in = MKLDNNMatrix::create(intPD);
-  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
-  CHECK(cvtInGrads_[idx]);
-}
-
-void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
-                               memory::primitive_desc intPD) {
-  cvtOutGrad_ = nullptr;
-  extOutGrad_ = nullptr;
-  out = nullptr;
-  MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(intPD, outMat);
-  resetMergeGrad(out);
-  if (outputIsOnlyMKLDNN()) {
-    return;
-  }
-  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
-  extOutGrad_ = out;
-  if (isPaddleFormat(extOutGrad_->getFormat())) {
-    return;
-  }
-  // need create reorder
-  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
-      << "should have external output value and the format must be nchw(nc)";
-  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
-  CHECK_PRIMITIVE_DESC_EQ(outVal_, intPD);
-  out = MKLDNNMatrix::create(intPD);
-  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
-  CHECK(cvtOutGrad_);
-}
-
-void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
-  mergeGrad_ = nullptr;
-  pipelineMergeGrad_.clear();
-  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
-    // do not merge when output is not all MKLDNN or only one output
-    return;
-  }
-  CHECK(out) << "should have reset internal ouput grad";
-  std::vector<float> scales(outputMap_.size(), 1.0);
-  std::vector<memory::primitive_desc> srcPDs;
-  std::vector<primitive::at> srcs;
-  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
-    MKLDNNMatrixPtr src =
-        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
-    CHECK(src) << "should be MKLDNNMatrix";
-    auto srcDims = src->getDims();
-    auto dstDims = out->getDims();
-    CHECK_EQ(srcDims.size(), dstDims.size());
-    for (size_t i = 0; i < srcDims.size(); ++i) {
-      CHECK_EQ(srcDims[i], dstDims[i]);
-    }
-    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
-                      << ", format " << src->getFormat();
-    srcPDs.push_back(src->getPrimitiveDesc());
-    srcs.push_back(*src);
-  }
-
-  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
-  mergeGrad_.reset(new sum(sumPD, srcs, *out));
-  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLayer.h
deleted file mode 100644
index 94dc8625f..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNLayer.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "MKLDNNBase.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/math/MKLDNNMatrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_bool(use_mkldnn);
-
-namespace paddle {
-
-class MKLDNNLayer;
-typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
-
-/**
- * @brief Base class of MKLDNNlayer.
- *
- */
-class MKLDNNLayer : public Layer {
- protected:
-  // batch size
-  int bs_;
-  // their sizes are always from the first input layer
-  // input image channel, height and width
-  int ic_, ih_, iw_;
-  // output image channel, height and width
-  int oc_, oh_, ow_;
-
-  // the condition that forward need be reset
-  size_t condition_;
-  // backward also need reset after reset forward handle
-  bool needResetBwd_;
-
-  // is output only mkldnn
-  bool outputOnlyMKLDNN_;
-
-  // mkldnn engine, stream and primivtives
-  mkldnn::engine engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwdWgt_;
-  std::shared_ptr<mkldnn::primitive> bwdData_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
-  /* Value and grad are seperated as internal and external buffers.
-   * Each MKLDNNLayer must init or reset internal buffer at least,
-   * and the external buffer format is always nchw of nc(when h==w==1),
-   * which is the same format as paddle.
-   * The output_.value and output_.grad always save the external data,
-   * when mixed with cpu device.
-   * When all layers are mkldnn layers, they could save internal data.
-   */
-  // below MKLDNNMatrix buffers are all internal buffers
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-  MKLDNNMatrixPtr outVal_;
-  MKLDNNMatrixPtr outGrad_;
-  // below are external value and grad
-  std::vector<MKLDNNMatrixPtr> extInVals_;
-  std::vector<MKLDNNMatrixPtr> extInGrads_;
-  MKLDNNMatrixPtr extOutVal_;
-  MKLDNNMatrixPtr extOutGrad_;
-  // convert handle between external and internal buffers
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
-  // weight and bias are always internal buffers
-  MKLDNNMatrixPtr wgtVal_;
-  MKLDNNMatrixPtr wgtGrad_;
-  MKLDNNMatrixPtr biasVal_;
-  MKLDNNMatrixPtr biasGrad_;
-
-  // merge grad primitive
-  std::shared_ptr<mkldnn::primitive> mergeGrad_;
-  std::vector<mkldnn::primitive> pipelineMergeGrad_;
-  // tmp input argument to save input grad, only used to merge grad
-  Argument tmpInArg_;
-
- public:
-  explicit MKLDNNLayer(const LayerConfig& config)
-      : Layer(config),
-        ih_(0),
-        iw_(0),
-        condition_(0),
-        needResetBwd_(true),
-        outputOnlyMKLDNN_(false),
-        engine_(mkldnn::engine::cpu, 0),
-        stream_(nullptr),
-        fwd_(nullptr),
-        bwdWgt_(nullptr),
-        bwdData_(nullptr) {}
-
-  ~MKLDNNLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
-
-  /**
-   * reshape the input and output channels and image sizes
-   * and reset output buffer size
-   */
-  virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
-
-  /**
-   * reset the mkldnn forward primitve and memories
-   * only would be called when input size changes
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * reset the mkldnn backward primitve and memories
-   * only would be called when needed
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * Update weights and biases if necessary.
-   */
-  virtual void updateWeights(const UpdateCallback& callback) {}
-
-  /**
-   * convert weight from paddle format to mkldnn format
-   * weight_ will be override
-   */
-  virtual void convertWeightsFromPaddle() {}
-
-  /**
-   * convert mkldnn weight to paddle format
-   * weight_ will be override
-   */
-  virtual void convertWeightsToPaddle() {}
-
-  /**
-   * add this interface as public for unit test
-   */
-  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
-
- protected:
-  /**
-   * Some layers may have different condition to reset the forward.
-   * The function returns the condition that do not need reset forward.
-   */
-  inline virtual size_t keepCondition() {
-    // reset when the first input element size changed, not only the batchsize
-    return inputLayers_[0]->getOutputValue()->getElementCnt();
-  }
-
-  /**
-   * reshape the input image sizes and input batchsize
-   */
-  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
-
-  /**
-   * reshape output image sizes
-   */
-  void reshapeOutput(size_t height, size_t width);
-
-  /**
-   * reset MKLDNNMatrix from Matrix and internal primitive desc.
-   * reset nullptr if matrix or primitive desc is empty
-   */
-  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                       const MatrixPtr& mat,
-                       mkldnn::memory::primitive_desc pd);
-
-  /**
-   * reset input value from input MKLDNNMatrix and internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   * input channel may be different in concat.
-   */
-  void resetInValue(
-      MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
-      size_t idx = 0,
-      int inputChannel = 0);
-
-  /**
-   * reset output value from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetOutValue(MKLDNNMatrixPtr& out,
-                     mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset input grad from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetInGrad(MKLDNNMatrixPtr& in,
-                   mkldnn::memory::primitive_desc intPD,
-                   size_t idx = 0);
-
-  /**
-   * reset output grad from internal primitive desc.
-   * merge grad if necessary.
-   * reset both internal and external buffer and create reorder if necessary.
-   * note: about merge grad, when this layer has several outputs,
-   *       it could not be mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset the merge grad primitive if necessary.
-   * note: do not support the grads mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetMergeGrad(MKLDNNMatrixPtr& out);
-
- protected:
-  /**
-   * Set deviceId of this layer.
-   */
-  void setDevice(int id) { deviceId_ = id; }
-
-  /**
-   * check the format is nchw or nc,
-   * which is supported by Paddle default memory layout
-   */
-  bool isPaddleFormat(mkldnn::memory::format fmt) {
-    if (fmt == mkldnn::memory::format::nchw ||
-        fmt == mkldnn::memory::format::nc) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * If input only has MKLDNN device.
-   * Otherwise, only support the previous layer using CPU device.
-   */
-  bool inputIsOnlyMKLDNN(int index = 0) {
-    int prevDevice = getPrev(index)->getDeviceId();
-    if (prevDevice == MKLDNN_DEVICE) {
-      return true;
-    } else {
-      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
-      return false;
-    }
-  }
-
-  /**
-   * If output only has MKLDNN device.
-   * Otherwise, other devices should only using CPU device.
-   */
-  bool outputIsOnlyMKLDNN() {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
-    }
-    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
-    return outputOnlyMKLDNN_;
-  }
-
-  /**
-   * print info about sizes
-   */
-  virtual void printSizeInfo() {
-    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
-                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
-                       << ", oh: " << oh_ << ", ow: " << ow_;
-  }
-
-  /**
-   * print the mkldnn memory format of value
-   */
-  virtual void printValueFormat() {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      if (!inVals_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
-                                                  : inVals_[i]->getFormat())
-                        << " >>> " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
-                        << (extOutVal_ ? extOutVal_->getFormat()
-                                       : outVal_->getFormat());
-    }
-    if (wgtVal_) {
-      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
-    }
-    if (biasVal_) {
-      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
-    }
-  }
-
-  /**
-   * print the mkldnn memory format of grad
-   */
-  virtual void printGradFormat() {
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
-                        << (extOutGrad_ ? extOutGrad_->getFormat()
-                                        : outGrad_->getFormat());
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      if (!inGrads_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
-                                                   : inGrads_[i]->getFormat())
-                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
-    }
-    if (wgtGrad_) {
-      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
-    }
-    if (biasGrad_) {
-      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
-    }
-  }
-
- private:
-  /**
-   * clear all grad
-   */
-  void clearGrads() {
-    if (output_.grad) {
-      output_.grad->zeroMem();
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].grad) {
-        outputOtherDevice_[i].grad->zeroMem();
-      }
-    }
-  }
-
-  /**
-   * Set deviceId of the params used in this layer.
-   */
-  void setParamsDevice(int id, const ParameterMap& parameterMap) {
-    for (auto& inputConfig : config_.inputs()) {
-      if (inputConfig.has_input_parameter_name()) {
-        ParameterPtr parameter;
-        std::string name = inputConfig.input_parameter_name();
-        CHECK(mapGet(name, parameterMap, &parameter))
-            << "Cannot find input parameter " << name << " for layer "
-            << getName();
-        parameter->setDevice(id);
-      }
-    }
-    if (config_.has_bias_parameter_name()) {
-      ParameterPtr parameter;
-      std::string name = config_.bias_parameter_name();
-      CHECK(mapGet(name, parameterMap, &parameter))
-          << "Cannot find bias parameter " << name << " for layer "
-          << getName();
-      parameter->setDevice(id);
-    }
-  }
-
-  /**
-   * Set output map of prev layers.
-   */
-  void setOutputMap() {
-    outputMap_.clear();
-    for (size_t i = 0; i < inputLayers_.size(); ++i) {
-      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
-    }
-  }
-
-  /**
-   * if have cpu device, share value and grad data with output_
-   */
-  void shareCPUDevice() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].value = output_.value;
-      outputOtherDevice_[i].grad = output_.grad;
-    }
-  }
-
-  /**
-   * Check the cpu device number of outputOtherDevice_.
-   * should have only one at most.
-   */
-  void checkCPUOutputsNumber(int max = 1) {
-    int cnt = 0;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-        ++cnt;
-      }
-    }
-    CHECK_LE(cnt, max) << "too much CPU devies";
-  }
-
-  /**
-   * copy SeqInfo from input layer to this output and other output devices.
-   * @note: do not use getInput(0) since it used this deviceId_,
-   *        use "inputLayers_[0]->getOutput()" instead.
-   */
-  void copySeqInfoToOutputs() {
-    if (inputLayers_.empty() || !needSequenceInfo_) {
-      return;
-    }
-    const Argument& input = inputLayers_[0]->getOutput();
-    output_.sequenceStartPositions = input.sequenceStartPositions;
-    output_.subSequenceStartPositions = input.subSequenceStartPositions;
-    output_.cpuSequenceDims = input.cpuSequenceDims;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].sequenceStartPositions =
-          output_.sequenceStartPositions;
-      outputOtherDevice_[i].subSequenceStartPositions =
-          output_.subSequenceStartPositions;
-      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-    }
-  }
-
-  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // MKLDNNLayer output value should be MKLDNNMatrix
-    // so external output value is necessary.
-    // Then external input value is not necessary,
-    // since input may be mkldnn internal buffer.
-    CHECK(extOutVal_) << "external output value is necessary";
-    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
-    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
-      if (cvtInVals_[i]) {
-        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
-      }
-    }
-    if (cvtOutVal_) {
-      pipeline.push_back(*cvtOutVal_);
-    }
-  }
-  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
-    }
-    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
-      if (cvtInGrads_[i]) {
-        pipeline.push_back(*cvtInGrads_[i]);
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
deleted file mode 100644
index 83d980538..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNPoolLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
-
-bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  ic_ = conf.channels();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  oc_ = ic_;
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  fh_ = conf.size_y();
-  fw_ = conf.size_x();
-  ph_ = conf.padding_y();
-  pw_ = conf.padding();
-  sh_ = conf.stride_y();
-  sw_ = conf.stride();
-
-  const std::string& type = conf.pool_type();
-  if (type == "max-projection") {
-    poolAlgo_ = algorithm::pooling_max;
-  } else if (type == "avg-projection") {
-    // paddle only use exclude_padding
-    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
-  } else {
-    LOG(FATAL) << "unknow pooling type!";
-  }
-  return true;
-}
-
-void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-
-  // cal output sizes
-  // paddle used false caffeMode for pooling
-  oh = outputSize(ih, fh_, ph_, sh_, false);
-  ow = outputSize(iw, fw_, pw_, sw_, false);
-  reshapeOutput(oh, ow);
-
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<pool_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  padding_kind padKind = padding_kind::zero;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = pool_fwd::desc(pk,
-                                poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padKind);
-  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
-
-  // prepare workspace if necessary
-  workspace_ =
-      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
-             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  CHECK(out);
-  auto bwdDesc = pool_bwd::desc(poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padding_kind::zero);
-  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-
-  bwdData_ =
-      workspace_
-          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
-          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
deleted file mode 100644
index 1eb0ee4ad..000000000
--- a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::pooling_forward pool_fwd;
-typedef mkldnn::pooling_backward pool_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer pool layer.
- *
- * The config file api is mkldnn_pool
- */
-class MKLDNNPoolLayer : public MKLDNNLayer {
- protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-
-  // pooling_avg or pooling_max
-  mkldnn::algorithm poolAlgo_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_pooling_forward.cpp, pool need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
- public:
-  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNPoolLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_;
-  }
-
- protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_pooling_forward.cpp
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
-        ++padR[0];
-      }
-      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
deleted file mode 100644
index d928ebc32..000000000
--- a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLPackedRecurrentLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
-
-bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
-  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
-  packed_weight_->pack();
-  if (needGradient_) {
-    packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
-    packed_weightT_->pack();
-  }
-  return true;
-}
-
-void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
-  RecurrentLayer::backward(callback);
-  packed_weight_->pack();
-  if (needGradient_) {
-    packed_weightT_->pack();
-  }
-}
-
-void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
-                                           size_t numSequences,
-                                           const int* starts) {
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->copyFromSeq(*output_.value);
-
-  {
-    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
-    /* forward one batch */
-    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
-      MatrixPtr batchValue = batchValue_->getBatchValue(n);
-
-      if (n != 0) {
-        MatrixPtr preBatchValue =
-            batchValue_->getBatchValue(n - 1, batchValue->getHeight());
-
-        packed_weight_->gemm_compute(preBatchValue, batchValue);
-      }
-      Argument arg;
-      arg.value = batchValue;
-      activation_->forward(arg).check();
-    }
-  }
-  batchValue_->copyBackSeq(*output_.value);
-}
-
-void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
-                                            size_t numSequences,
-                                            const int* starts) {
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  size_t numBatch = batchGrad_->getNumBatch();
-  bool backwardByBatch = numBatch < numSequences;
-
-  batchGrad_->copyFromSeq(*output_.grad);
-  {
-    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
-    /* backward one batch */
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
-      MatrixPtr batchValue =
-          batchValue_->getBatchValue(n, batchGrad->getHeight());
-
-      Argument arg;
-      arg.value = batchValue;
-      arg.grad = batchGrad;
-      activation_->backward(arg).check();
-
-      if (n != 0) {
-        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
-        packed_weightT_->gemm_compute(batchGrad, batchValue);
-      }
-
-      if (backwardByBatch && weight_->getWGrad()) {
-        if (n != 0) {
-          /* backward weight */
-          batchValue =
-              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
-          weight_->getWGrad()->mul(
-              *batchValue->getTranspose(), *batchGrad, 1, 1);
-        }
-      }
-    }
-  }
-
-  batchGrad_->copyBackSeq(*output_.grad);
-
-  if (!backwardByBatch && weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
-    for (size_t seq = 0; seq < numSequences; ++seq) {
-      int len = starts[seq + 1] - starts[seq];
-      weight_->getWGrad()->mul(
-          *output_.value
-               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
-               ->getTranspose(),
-          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
-                                   len - 1),
-          1,
-          1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
deleted file mode 100644
index 441025a9c..000000000
--- a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLPackedWeight.h"
-#include "RecurrentLayer.h"
-
-DECLARE_bool(rnn_use_batch);
-
-namespace paddle {
-
-/**
- * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer
- * but is optimized with MKL cblas packed gemm.
- * More details:
- * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
- */
-
-class MKLPackedRecurrentLayer : public RecurrentLayer {
- public:
-  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
-      : RecurrentLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts) override;
-
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int* starts) override;
-
- protected:
-  /// packed_weight_ contains same data with
-  /// RecurrentLayer::weight_ but is packed
-  std::unique_ptr<MKLPackedWeight> packed_weight_;
-  /// packed_weightT_ is the transposition matrix of packed_weight_
-  std::unique_ptr<MKLPackedWeight> packed_weightT_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedWeight.h b/paddle/legacy/gserver/layers/MKLPackedWeight.h
deleted file mode 100644
index 47f225bd0..000000000
--- a/paddle/legacy/gserver/layers/MKLPackedWeight.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/MathFunctions.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Weight.h"
-
-namespace paddle {
-
-class MKLPackedWeight {
- protected:
-  /// The pointer of weight
-  real *weight_;
-  /// The pointer of cblas packed gemm to weight
-  real *packedWeight_;
-  size_t height_;
-  size_t width_;
-  bool transW_;
-
- public:
-  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
-    packedWeight_ = nullptr;
-    weight_ = weight->getData();
-    height_ = weight->getHeight();
-    width_ = weight->getWidth();
-    transW_ = transW;
-  }
-
-  ~MKLPackedWeight() { free_(); }
-
-  void pack() { pack_(weight_); }
-
-  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
-    cblas_sgemm_compute(CblasRowMajor,
-                        CblasNoTrans,
-                        CblasPacked,
-                        src->getHeight(),
-                        transW_ ? height_ : width_,
-                        transW_ ? width_ : height_,
-                        src->getData(),
-                        src->getWidth(),
-                        packedWeight_,
-                        width_,
-                        1.0,
-                        dst->getData(),
-                        dst->getWidth());
-  }
-
- protected:
-  void pack_(real *src) {
-    if (!packedWeight_) {
-      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
-    }
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     transW_ ? CblasTrans : CblasNoTrans,
-                     1,
-                     transW_ ? height_ : width_,
-                     transW_ ? width_ : height_,
-                     1.0,
-                     src,
-                     width_,
-                     packedWeight_);
-  }
-
-  void free_() {
-    if (packedWeight_) {
-      cblas_sgemm_free(packedWeight_);
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxIdLayer.cpp b/paddle/legacy/gserver/layers/MaxIdLayer.cpp
deleted file mode 100644
index eecd4996e..000000000
--- a/paddle/legacy/gserver/layers/MaxIdLayer.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for finding the id which has the maximal value for each sample.
- * The result is stored in output_.ids.
- *
- * The config file api is maxid_layer.
- */
-class MaxIdLayer : public Layer {
- private:
-  /// a predetermined number of best states at each level
-  size_t beamSize_;
-
- public:
-  explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-
-    beamSize_ = config_.has_beam_size() ? config_.beam_size() : FLAGS_beam_size;
-    CHECK_GE(beamSize_, 1LU);
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const Argument& input = getInput(0);
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_);
-    Matrix::resizeOrCreate(output_.in,
-                           batchSize,
-                           beamSize_,
-                           false,
-                           /* useGpu */ useGpu_);
-    output_.value = nullptr;
-    input.value->rowMax(*output_.ids, *output_.in);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(maxid, MaxIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.cpp b/paddle/legacy/gserver/layers/MaxLayer.cpp
deleted file mode 100644
index b51251b66..000000000
--- a/paddle/legacy/gserver/layers/MaxLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(max, MaxLayer);
-
-void MaxLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  IVector::resizeOrCreate(
-      maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_));
-  maxIndex_->zeroMem();
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(
-        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  }
-
-  if (config_.output_max_index()) {
-    // copy maxIndex_ to output
-    outputValue->copyFrom(*maxIndex_);
-  } else {
-    /* add the bias-vector AFTER max operation */
-    if (biases_.get() != NULL) {
-      outputValue->addBias(*(biases_->getW()), 1);
-    }
-    /* activation */ { forwardActivation(); }
-  }
-}
-
-void MaxLayer::backward(const UpdateCallback& callback) {
-  CHECK(!config_.output_max_index())
-      << "backward is not available when output_max_index is set";
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  if (inputGrad) {
-    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(
-        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.h b/paddle/legacy/gserver/layers/MaxLayer.h
deleted file mode 100644
index 12d0128e3..000000000
--- a/paddle/legacy/gserver/layers/MaxLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal max" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = max_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the max pooling operation is
- *              then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class MaxLayer : public SequencePoolLayer {
- protected:
-  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
-  IVectorPtr maxIndex_;
-
- public:
-  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    return SequencePoolLayer::init(layerMap, parameterMap);
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.cpp b/paddle/legacy/gserver/layers/MaxOutLayer.cpp
deleted file mode 100644
index 919f62a45..000000000
--- a/paddle/legacy/gserver/layers/MaxOutLayer.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxOutLayer.h"
-#include "hl_cnn.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-REGISTER_LAYER(maxout, MaxOutLayer);
-
-size_t MaxOutLayer::getSize() {
-  const MaxOutConfig& maxoutConf = config_.inputs(0).maxout_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = maxoutConf.image_conf().img_size_y();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = maxoutConf.image_conf().img_size();
-  }
-
-  featLen_ = imgSizeH_ * imgSizeW_;
-  size_t layerSize = featLen_ * outputChannels_;
-
-  getOutput().setFrameHeight(imgSizeH_);
-  getOutput().setFrameWidth(imgSizeW_);
-
-  return layerSize;
-}
-
-bool MaxOutLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for maxout-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
-  groups_ = conf.groups();
-  channels_ = conf.image_conf().channels();
-  CHECK_EQ(channels_ % groups_, 0UL);
-  outputChannels_ = channels_ / groups_;
-
-  return true;
-}
-
-void MaxOutLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  /* note: one sample correspond to one column */
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t size = getSize();
-  resetOutput(batchSize, size);
-  MatrixPtr inputV = getInputValue(0);
-  MatrixPtr outV = getOutputValue();
-
-  IVector::resizeOrCreate(maxoutId_, size * batchSize, useGpu_);
-  outV->maxoutForward(*inputV, *maxoutId_, outputChannels_, groups_);
-}
-
-void MaxOutLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  /* Do derivation */
-  MatrixPtr inputG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-
-  if (inputG) {
-    inputG->maxoutBackward(*outG, *maxoutId_, outputChannels_, groups_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.h b/paddle/legacy/gserver/layers/MaxOutLayer.h
deleted file mode 100644
index e56f34b8e..000000000
--- a/paddle/legacy/gserver/layers/MaxOutLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer to do max out on conv layer output.
- * Input: output of a conv layer.
- * Output: feature map size same as input.  Channel is (input channel) / groups.
- * So the num of channels should be able to devided by groups.
- *
- * The config file api is maxout_layer.
- */
-
-class MaxOutLayer : public Layer {
- protected:
-  size_t groups_;
-  size_t imgSizeH_, imgSizeW_;
-  /// outputChannels_ = channels_ / groups_
-  size_t channels_, outputChannels_;
-  /// feature length = imgSizeH_ * imgSizeW_
-  size_t featLen_;
-  IVectorPtr maxoutId_;
-
- public:
-  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
-  size_t getSize();
-
-  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
-  virtual ~MaxOutLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
deleted file mode 100644
index a1cc59a71..000000000
--- a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxPoolWithMaskLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-  setOutput("mask", &mask_);
-  return true;
-}
-
-size_t MaxPoolWithMaskLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-
-  outputY_ = outputSize(imgSizeY_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputX_ = outputSize(imgSize_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputX_ * outputY_ * channels_;
-  getOutput().setFrameHeight(outputY_);
-  getOutput().setFrameWidth(outputX_);
-
-  return layerSize;
-}
-
-void MaxPoolWithMaskLayer::forward(PassType passType) {
-  size_t size = getSize();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  int batchSize = inputV->getHeight();
-  resetOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  CHECK_EQ(size, outV->getWidth());
-
-  resetSpecifyOutput(mask_,
-                     batchSize,
-                     size,
-                     /* isValueClean */ false,
-                     /* isGradClean */ true);
-
-  MatrixPtr maskV = mask_.value;
-  outV->maxPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_,
-                       maskV);
-}
-
-void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  MatrixPtr outGrad = getOutputGrad();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
-
-  inputGrad->maxPoolBackward(*inputV,
-                             imgSizeY_,
-                             imgSize_,
-                             *outGrad,
-                             *outV,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
deleted file mode 100644
index fcd5388ab..000000000
--- a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class MaxPoolWithMaskLayer : public PoolLayer {
- protected:
-  Argument mask_;
-
- public:
-  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
-      : PoolLayer(config) {}
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.cpp b/paddle/legacy/gserver/layers/MixedLayer.cpp
deleted file mode 100644
index 63e658c09..000000000
--- a/paddle/legacy/gserver/layers/MixedLayer.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MixedLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(mixed, MixedLayer);
-
-bool MixedLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    if (config_.inputs(i).has_proj_conf()) {
-      projections_[i].reset(Projection::create(
-          config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-    } else {
-      CHECK(!parameters_[i]) << "should no parameters for operators";
-    }
-  }
-  for (auto& operator_conf : config_.operator_confs()) {
-    for (auto& input_index : operator_conf.input_indices()) {
-      CHECK(!config_.inputs(input_index).has_proj_conf());
-    }
-    operators_.emplace_back(Operator::create(operator_conf, useGpu_));
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void MixedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->prefetch(&getInput(i));
-    }
-  }
-}
-
-void MixedLayer::resetState() {
-  for (auto& proj : projections_) {
-    if (proj) {
-      proj->resetState();
-    }
-  }
-}
-
-void MixedLayer::setState(LayerStatePtr state) {
-  CHECK(projectionStateMatrixSize_.size() == projections_.size())
-      << "projection size mis-match";
-
-  int start = 0;
-  LayerStatePtr statePtr = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) {
-    if (projectionStateMatrixSize_[i] > 0) {
-      statePtr->value.clear();
-      for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) {
-        statePtr->value.push_back(state->value[j]);
-      }
-      projections_[i]->setState(statePtr);
-      start += projectionStateMatrixSize_[i];
-    }
-  }
-  CHECK((int)state->value.size() == start) << "state matrix size mis-match";
-}
-
-// Return state which consists of all projections states
-LayerStatePtr MixedLayer::getState() {
-  bool init = projectionStateMatrixSize_.size() == 0;
-  LayerStatePtr res = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projections_.size(); i++) {
-    LayerStatePtr statePtr =
-        projections_[i] ? projections_[i]->getState() : nullptr;
-    int stateSize = statePtr == nullptr ? 0 : statePtr->value.size();
-    if (init) {
-      projectionStateMatrixSize_.push_back(stateSize);
-    } else {
-      CHECK(projectionStateMatrixSize_[i] == stateSize)
-          << "state matrix size mis-match";
-    }
-    if (statePtr != nullptr) {
-      for (auto& matrixPtr : statePtr->value) {
-        res->value.push_back(matrixPtr);
-      }
-    }
-  }
-  return res;
-}
-
-void MixedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->forward(&getInput(i), &output_, passType);
-    }
-  }
-
-  std::vector<const Argument*> ins;
-  for (auto& op : operators_) {
-    ins.clear();
-    for (auto& input_index : op->getConfig().input_indices()) {
-      ins.push_back(&getInput(input_index));
-    }
-    op->forward(ins, &output_, passType);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MixedLayer::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-
-  for (auto& op : operators_) {
-    op->backward();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.h b/paddle/legacy/gserver/layers/MixedLayer.h
deleted file mode 100644
index 43ee2bd81..000000000
--- a/paddle/legacy/gserver/layers/MixedLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "Operator.h"
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * A mixed layer has multiple input layers.
- * Each input layer was processed by a Projection or Operator.
- * The results of all projections or Operators are summed together with bias
- * (if configured), and then go through an activation function and dropout
- * (if configured).
- *
- * The config file api is mixed_layer.
- */
-class MixedLayer : public Layer {
- public:
-  explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MixedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void resetState() override;
-  /**
-   * setState() should be called after getState().
-   * Argument state consists of all projections states.
-   */
-  void setState(LayerStatePtr state) override;
-  /**
-   * Return state which consists of all projections states.
-   */
-  LayerStatePtr getState() override;
-
- protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<std::unique_ptr<Operator>> operators_;
-  /// the matrix size of projection state
-  std::vector<int> projectionStateMatrixSize_;
-  std::unique_ptr<Weight> biases_;
-  bool sharedBias_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp b/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
deleted file mode 100644
index 335e9a6ac..000000000
--- a/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiBoxLossLayer.h"
-#include <float.h>
-#include <vector>
-#include "DataLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
-
-bool MultiBoxLossLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  auto layerConf = config_.inputs(0).multibox_loss_conf();
-  numClasses_ = layerConf.num_classes();
-  inputNum_ = layerConf.input_num();
-  overlapThreshold_ = layerConf.overlap_threshold();
-  negPosRatio_ = layerConf.neg_pos_ratio();
-  negOverlap_ = layerConf.neg_overlap();
-  backgroundId_ = layerConf.background_id();
-  return true;
-}
-
-void MultiBoxLossLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-  resetOutput(batchSize, 1);
-
-  // all location data and confidence score data
-  locSizeSum_ = 0;
-  confSizeSum_ = 0;
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    locSizeSum_ += inLoc->getElementCnt();
-    confSizeSum_ += inConf->getElementCnt();
-  }
-
-  // locBuffer layout:
-  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
-  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
-  locBuffer_ = locTmpBuffer_;
-
-  // confBuffer layout:
-  // | class1 score | class2 score | ... |classN score | class1 score | ......
-  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
-  confBuffer_ = confTmpBuffer_;
-
-  // concate location data and confidence score data
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto& layerConf = config_.inputs(0).multibox_loss_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
-    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-    locOffset += appendWithPermute(*inLoc,
-                                   height,
-                                   width,
-                                   locSizeSum_,
-                                   locOffset,
-                                   batchSize,
-                                   *locBuffer_,
-                                   kNCHWToNHWC);
-    confOffset += appendWithPermute(*inConf,
-                                    height,
-                                    width,
-                                    confSizeSum_,
-                                    confOffset,
-                                    batchSize,
-                                    *confBuffer_,
-                                    kNCHWToNHWC);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-
-  // priorValue layout:
-  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
-  // | xmin2 | ......
-  MatrixPtr priorValue;
-
-  // labelValue layout:
-  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
-  MatrixPtr labelValue;
-
-  // Copy data from GPU to CPU if use GPU
-  if (useGpu_) {
-    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
-    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
-    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
-    Matrix::resizeOrCreate(
-        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
-    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
-    Matrix::resizeOrCreate(labelCpuValue_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    locCpuBuffer_->copyFrom(*locTmpBuffer_);
-    confCpuBuffer_->copyFrom(*confTmpBuffer_);
-    priorCpuValue_->copyFrom(*priorTmpValue);
-    labelCpuValue_->copyFrom(*labelTmpValue);
-
-    locBuffer_ = locCpuBuffer_;
-    confBuffer_ = confCpuBuffer_;
-    priorValue = priorCpuValue_;
-    labelValue = labelCpuValue_;
-  } else {
-    priorValue = getInputValue(*getPriorBoxLayer());
-    labelValue = getInputValue(*getLabelLayer());
-  }
-
-  // Get max scores for each prior bbox. Used in negative mining
-  std::vector<std::vector<real>> allMaxConfScore;
-  numPriors_ = priorValue->getElementCnt() / 8;
-  getMaxConfidenceScores(confBuffer_->getData(),
-                         batchSize,
-                         numPriors_,
-                         numClasses_,
-                         backgroundId_,
-                         &allMaxConfScore);
-
-  // Match prior bbox to groundtruth bbox
-  Argument label = getInput(*getLabelLayer());
-  const int* labelIndex = label.sequenceStartPositions->getData(false);
-  size_t seqNum = label.getNumSequences();
-  numMatches_ = 0;
-  numNegs_ = 0;
-  allMatchIndices_.clear();
-  allNegIndices_.clear();
-
-  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
-                                                           numPriors_,
-                                                           *labelValue,
-                                                           labelIndex,
-                                                           seqNum,
-                                                           allMaxConfScore,
-                                                           batchSize,
-                                                           overlapThreshold_,
-                                                           negOverlap_,
-                                                           negPosRatio_,
-                                                           &allMatchIndices_,
-                                                           &allNegIndices_);
-  numMatches_ = retPair.first;
-  numNegs_ = retPair.second;
-
-  // BBox location L1 smooth loss
-  locLoss_ = 0.0;
-  if (numMatches_ >= 1) {
-    size_t count = 0;
-    MatrixPtr locLossOutput;
-    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
-    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
-    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
-    locDiff_->zeroMem();
-    std::vector<real> locGTData;
-
-    real* locDiffData = locDiff_->getData();
-    const real* locBufferData = locBuffer_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;  // match none
-        size_t locOffset =
-            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
-        std::copy(locBufferData + locOffset,
-                  locBufferData + locOffset + 4,
-                  locDiffData + count);
-        count += 4;
-        const int gtIdx = allMatchIndices_[n][i];
-        size_t priorOffset = i * 8;
-        std::vector<NormalizedBBox> priorBBoxVec;
-        getBBoxFromPriorData(
-            priorValue->getData() + priorOffset, 1, priorBBoxVec);
-        std::vector<std::vector<real>> priorBBoxVar;
-        getBBoxVarFromPriorData(
-            priorValue->getData() + priorOffset, 1, priorBBoxVar);
-        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
-        std::vector<NormalizedBBox> gtBBoxVec;
-        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
-        std::vector<real> gtEncode;
-        encodeBBoxWithVar(
-            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
-        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
-      }
-    }
-    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
-    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
-    locLoss_ = locLossOutput->getSum() / numMatches_;
-  }
-
-  // BBox confidence softmax loss
-  confLoss_ = 0;
-  numConf_ = numMatches_ + numNegs_;
-  if (numConf_ >= 1) {
-    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
-    IVector::resizeOrCreate(confGTData_, numConf_, false);
-    confProb_->zeroMem();
-    size_t count = 0;
-
-    std::vector<real> confPredData;
-    real* confProbData = confProb_->getData();
-    const real* confBufferData = confBuffer_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
-        const int gtLabel = (labelValue->getData() + labelOffset)[0];
-        confGTData_->getData()[count] = gtLabel;
-        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
-        std::copy(confBufferData + confOffset,
-                  confBufferData + confOffset + numClasses_,
-                  confProbData + count * numClasses_);
-        confPredData.reserve(confPredData.size() + numClasses_);
-        confPredData.insert(confPredData.end(),
-                            confBufferData + confOffset,
-                            confBufferData + confOffset + numClasses_);
-        ++count;
-      }
-      // Negative mining samples
-      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
-        confGTData_->getData()[count] = backgroundId_;
-        size_t confOffset =
-            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
-        std::copy(confBufferData + confOffset,
-                  confBufferData + confOffset + numClasses_,
-                  confProbData + count * numClasses_);
-        confPredData.reserve(confPredData.size() + numClasses_);
-        confPredData.insert(confPredData.end(),
-                            confBufferData + confOffset,
-                            confBufferData + confOffset + numClasses_);
-        ++count;
-      }
-    }
-    CHECK_EQ(numConf_, count);
-    confProb_->softmax(*confProb_);
-    MatrixPtr confLossOutput;
-    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
-    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
-    confLoss_ = confLossOutput->getSum() / numMatches_;
-  }
-  real loss = locLoss_ + confLoss_;
-  MatrixPtr outV = getOutputValue();
-  outV->assign(loss);
-}
-
-void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
-  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
-  locBuffer_->zeroMem();
-  confBuffer_->zeroMem();
-
-  // Back propagate on location prediction
-  if (numMatches_ >= 1) {
-    MatrixPtr locDiffBuffer;
-    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
-    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
-    locDiff_->copyFrom(*locDiffBuffer);
-    // scale gradient
-    for (size_t i = 0; i < numMatches_ * 4; ++i)
-      locDiff_->getData()[i] *= (1. / numMatches_);
-    // Copy gradient back
-    size_t count = 0;
-    const real* locDiffData = locDiff_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        real* locBufferData =
-            locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
-        std::copy(locDiffData + count * 4,
-                  locDiffData + (count + 1) * 4,
-                  locBufferData);
-        ++count;
-      }
-    }
-    CHECK_EQ(count, numMatches_);
-  }
-
-  if (numConf_ >= 1) {
-    for (size_t i = 0; i < numConf_; ++i)
-      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
-    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
-      confProb_->getData()[i] *= (1. / numMatches_);
-    size_t count = 0;
-    const real* confProbData = confProb_->getData();
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (size_t i = 0; i < numPriors_; ++i) {
-        if (allMatchIndices_[n][i] == -1) continue;
-        real* confDiffData = confBuffer_->getData() +
-                             n * numPriors_ * numClasses_ + i * numClasses_;
-        std::copy(confProbData + count * numClasses_,
-                  confProbData + (count + 1) * numClasses_,
-                  confDiffData);
-        ++count;
-      }
-      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
-        int idx = allNegIndices_[n][i];
-        real* confDiffData = confBuffer_->getData() +
-                             n * numPriors_ * numClasses_ + idx * numClasses_;
-        std::copy(confProbData + count * numClasses_,
-                  confProbData + (count + 1) * numClasses_,
-                  confDiffData);
-        ++count;
-      }
-    }
-    CHECK_EQ(count, numConf_);
-  }
-  if (useGpu_) {
-    locTmpBuffer_->copyFrom(*locCpuBuffer_);
-    confTmpBuffer_->copyFrom(*confCpuBuffer_);
-    locBuffer_ = locTmpBuffer_;
-    confBuffer_ = confTmpBuffer_;
-  }
-  // copy back
-  size_t locOffset = 0;
-  size_t confOffset = 0;
-  auto layerConf = config_.inputs(0).multibox_loss_conf();
-  for (size_t n = 0; n < inputNum_; ++n) {
-    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
-    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
-    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
-    // only for unittest, there are no width and height information
-    // when constructing matrix in unittest, so we should
-    // set the shape in configuration
-    if (!height) height = layerConf.height();
-    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
-    if (!width) width = layerConf.width();
-
-    // NHWC to NCHW
-    MatrixPtr locGBuffer;
-    Matrix::resizeOrCreate(
-        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
-    MatrixPtr confGBuffer;
-    Matrix::resizeOrCreate(
-        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
-
-    locOffset += decomposeWithPermute(*locBuffer_,
-                                      height,
-                                      width,
-                                      locSizeSum_,
-                                      locOffset,
-                                      batchSize,
-                                      *locGBuffer,
-                                      kNHWCToNCHW);
-    inLocG->add(*locGBuffer);
-    confOffset += decomposeWithPermute(*confBuffer_,
-                                       height,
-                                       width,
-                                       confSizeSum_,
-                                       confOffset,
-                                       batchSize,
-                                       *confGBuffer,
-                                       kNHWCToNCHW);
-    inConfG->add(*confGBuffer);
-  }
-  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
-  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiBoxLossLayer.h b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
deleted file mode 100644
index a358cded0..000000000
--- a/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
-
-licensed under the apache license, version 2.0 (the "license");
-you may not use this file except in compliance with the license.
-you may obtain a copy of the license at
-
-    http://www.apache.org/licenses/license-2.0
-
-unless required by applicable law or agreed to in writing, software
-distributed under the license is distributed on an "as is" basis,
-without warranties or conditions of any kind, either express or implied.
-see the license for the specific language governing permissions and
-limitations under the license. */
-
-#pragma once
-
-#include <vector>
-#include "CostLayer.h"
-#include "DataLayer.h"
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-using std::vector;
-using std::pair;
-
-namespace paddle {
-
-/**
- * The multibox loss layer for a SSD detection task.
- * The loss is composed by the location loss and the confidence loss.
- * The location loss is a smooth L1 loss and the confidence loss is
- * a softmax loss.
- * - Input: This layer needs four input layers: The first input layer
- *          is the priorbox layer and the second layer is a label layer.
- *          The rest two input layers are convolution layers for generating
- *          bbox location offset and the classification confidence.
- * - Output: The Single Shot Multibox Detection loss value.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class MultiBoxLossLayer : public CostLayer {
- public:
-  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
-
- protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[2 + index];
-  }
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[2 + inputNum_ + index];
-  }
-
- protected:
-  size_t numClasses_;
-  real overlapThreshold_;
-  real negPosRatio_;
-  real negOverlap_;
-  size_t inputNum_;
-  size_t backgroundId_;
-
-  real locLoss_;
-  real confLoss_;
-
-  size_t numPriors_;
-  size_t numMatches_;
-  size_t numNegs_;
-  size_t numConf_;
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  vector<vector<int>> allMatchIndices_;
-  vector<vector<int>> allNegIndices_;
-  MatrixPtr locGTData_;
-  IVectorPtr confGTData_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locDiff_;
-  MatrixPtr confProb_;
-
-  MatrixPtr labelCpuValue_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.cpp b/paddle/legacy/gserver/layers/MultinomialSampler.cpp
deleted file mode 100644
index e74ed795a..000000000
--- a/paddle/legacy/gserver/layers/MultinomialSampler.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultinomialSampler.h"
-
-namespace paddle {
-
-MultinomialSampler::MultinomialSampler(const real* prob, int size)
-    : rand_(0.0, size) {
-  intervals_.resize(size + 1);
-  double sum = 0;
-  for (int i = 0; i < size; ++i) {
-    sum += prob[i];
-  }
-
-  double intervalLength = sum / size;
-  double s = 1 / intervalLength;
-  for (int i = 0; i < size; ++i) {
-    intervals_[i] = {i, (real)(prob[i] * s)};
-  }
-
-  auto nextSmallPos = [&](int pos) {
-    while (pos < size &&
-           (pos != intervals_[pos].otherId || intervals_[pos].thresh >= 1)) {
-      ++pos;
-    }
-    return pos;
-  };
-
-  auto nextBigPos = [&](int pos) {
-    while (pos < size && intervals_[pos].thresh < 1) {
-      ++pos;
-    }
-    return pos;
-  };
-
-  int smallPos = nextSmallPos(0);
-  int bigPos = nextBigPos(0);
-
-  auto fillIntervals = [&]() {
-    while (bigPos < size) {
-      while (intervals_[bigPos].thresh > 1 && smallPos < size) {
-        intervals_[smallPos].otherId = bigPos;
-        intervals_[bigPos].thresh -= 1 - intervals_[smallPos].thresh;
-        smallPos = nextSmallPos(smallPos + 1);
-      }
-      if (smallPos >= size) break;
-      bigPos = nextBigPos(bigPos + 1);
-      // If intervals_[bigPos].thresh < 1, it becomes a small interval
-    }
-  };
-
-  fillIntervals();
-
-  smallPos = nextSmallPos(0);
-
-  // At this point there is no small intervals after bigPos. And this condition
-  // will remain true during the next fillIntervals()
-
-  fillIntervals();
-
-  // Handle the inaccuracy caused by finite-precision arithmetic which
-  // may results in some unprocessed small or big intervals at this point.
-  for (int i = 0; i < size; ++i) {
-    if (intervals_[i].otherId == i) {
-      intervals_[i].thresh = 1;
-    }
-  }
-
-  // The last one is to safeguard the case that the random number is equal
-  // to size
-  intervals_[size] = {size - 1, 1};
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.h b/paddle/legacy/gserver/layers/MultinomialSampler.h
deleted file mode 100644
index ed4453524..000000000
--- a/paddle/legacy/gserver/layers/MultinomialSampler.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <random>
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/**
- * @brief Given the probability of N objects, the sampler random select
- * one of the object.
- * @note: prob does not have to be unnormalized.
- *
- * The space requirement is O(N)=O(N * sizeof(Interval)).
- * The computational complexity of generate one sample is O(1).
- */
-class MultinomialSampler {
- public:
-  MultinomialSampler(const real* prob, int size);
-
-  //! protobuf always using double.
-  static MultinomialSampler* create(const double* prob, int size) {
-#ifdef PADDLE_TYPE_DOUBLE
-    return new MultinomialSampler(prob, size);
-#else
-    std::unique_ptr<real[]> tmp(new real[size]);
-    std::copy(prob, prob + size, tmp.get());
-    return new MultinomialSampler(tmp.get(), size);
-#endif
-  }
-
-  /**
-   * @brief Generate a random sample.
-   * @param g is a random number engine. See <random>.
-   * @return Random integer.
-   */
-  template <typename URNG>
-  int gen(URNG& g) {
-    return gen1([&g, this]() { return rand_(g); });
-  }
-
- protected:
-  /**
-   * @brief Generation
-   * @param[in] rand rand is a real random number distribution
-   * for the range [0, size).
-   * @return random int number or intervals_[random_int_number].otherId.
-   */
-  template <typename Rand>
-  int gen1(Rand rand) {
-    double r = rand();  // NOLINT
-    int i = (int)r;
-    r -= i;
-    return r < intervals_[i].thresh ? i : intervals_[i].otherId;
-  }
-
-  struct Interval {
-    int otherId;
-    real thresh;
-  };
-
-  /// The probability of each interval will be 1./size
-  std::vector<Interval> intervals_;
-  std::uniform_real_distribution<double> rand_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiplexLayer.cpp b/paddle/legacy/gserver/layers/MultiplexLayer.cpp
deleted file mode 100644
index 9ca2b2417..000000000
--- a/paddle/legacy/gserver/layers/MultiplexLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *@brief This layer multiplex multiple layers according to the index,
- * which is provided by the first input layer.
- * - Input[0]: the index of the layer to output of size batchSize.
- * - Input[1:N]; the candidate output data.
- * For each index i from 0 to batchSize -1, the output is the i-th row of the
- * (index[i] + 1)-th layer.
- *
- * For each i-th row of output:
- *
- * \f[
- *   y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
- * \f]
- * where, y is output. \f$x_{k}\f$ is the k-th input layer and
- * \f$k = x_{0}[i] + 1\f$.
- */
-
-class MultiplexLayer : public Layer {
- protected:
-  /**
-   * @brief A struct is used to save the copy information, includes input
-   * layer index and copy size.
-   */
-  struct CopyInfo {
-    CopyInfo(int inStartIdx, int inLength, int inCopyIdx)
-        : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {}
-
-    /// The start row of input.
-    int startIdx;
-    /// Number of rows. If the layer index in Input[0] is not consecutive,
-    /// the length is one. Otherwise, the length is > 1 and copy multi rows
-    /// once.
-    int length;
-    /// The copied layer index, which needs to add 1.
-    int copyIdx;
-  };
-
-  /// A list of CopyInfo used to save copy information.
-  std::vector<CopyInfo> copySchedule_;
-
-  /// Temporary matrix pointer to point to input data.
-  MatrixPtr tmpSrc_;
-  /// Temporary matrix pointer to point to output data.
-  MatrixPtr tmpDest_;
-
- public:
-  explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MultiplexLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /**
-   * @brief Calculate copy info for input layers.
-   */
-  void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns);
-};
-
-REGISTER_LAYER(multiplex, MultiplexLayer);
-
-void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds,
-                                           size_t numIns) {
-  copySchedule_.clear();
-  CopyInfo prevCopyInfo(0, 0, -1);
-  for (size_t i = 0; i < copyIds->getSize(); i++) {
-    int copyId = copyIds->getElement(i);
-    CHECK_GE(copyId, 0);
-    CHECK_LT(copyId, int(numIns));
-    // copy same input layer with prevous and will copy consecutive.
-    if (copyId == prevCopyInfo.copyIdx) {
-      ++prevCopyInfo.length;
-    } else {
-      if (prevCopyInfo.copyIdx != -1) {
-        copySchedule_.emplace_back(prevCopyInfo);
-      }
-      prevCopyInfo.startIdx = i;
-      prevCopyInfo.length = 1;
-      prevCopyInfo.copyIdx = copyId;
-    }
-  }
-  if (prevCopyInfo.copyIdx != -1) {
-    copySchedule_.emplace_back(prevCopyInfo);
-  }
-}
-
-bool MultiplexLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_GE(inputLayers_.size(), 2U);
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  return true;
-}
-
-void MultiplexLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  IVectorPtr copyIds = getInput(0).ids;
-  MatrixPtr inV1 = getInputValue(1);
-  CHECK_EQ(copyIds->getSize(), inV1->getHeight());
-  for (size_t i = 2; i < inputLayers_.size(); i++) {
-    CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight());
-    CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth());
-  }
-
-  calculateCopySchedule(copyIds, inputLayers_.size() - 1);
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(inV1->getHeight(), inV1->getWidth());
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      outV->subMatrix(info.startIdx, info.length, tmpDest_)
-          ->copyFrom(*getInputValue(info.copyIdx + 1)
-                          ->subMatrix(info.startIdx, info.length, tmpSrc_));
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MultiplexLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      if (getInputGrad(info.copyIdx + 1)) {
-        getInputGrad(info.copyIdx + 1)
-            ->subMatrix(info.startIdx, info.length, tmpDest_)
-            ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_));
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NCELayer.cpp b/paddle/legacy/gserver/layers/NCELayer.cpp
deleted file mode 100644
index ae4d64081..000000000
--- a/paddle/legacy/gserver/layers/NCELayer.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include "Layer.h"
-#include "MultinomialSampler.h"
-#include "paddle/legacy/math/MathFunctions.h"
-
-namespace paddle {
-
-/**
- * Noise-contrastive estimation.
- * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language
- * models.
- *
- * The config file api is nce_layer.
- */
-class NCELayer : public Layer {
-  int numClasses_;
-  /// number of input layer besides labelLayer and weightLayer
-  int numInputs_;
-  LayerPtr labelLayer_;
-  /// weight layer, can be None
-  LayerPtr weightLayer_;
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  std::unique_ptr<MultinomialSampler> sampler_;
-
-  std::uniform_int_distribution<int> rand_;
-
-  struct Sample {
-    int sampleId;
-    int labelId;
-    bool target;
-    real weight;
-  };
-  std::vector<Sample> samples_;
-  /// whether samples_ is prepared
-  bool prepared_;
-  Argument sampleOut_;
-
-  IVectorPtr labelIds_;
-
- public:
-  explicit NCELayer(const LayerConfig& config)
-      : Layer(config),
-        numClasses_(config.num_classes()),
-        rand_(0, config.num_classes() - 1),
-        prepared_(false) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    /* Initialize the basic parent class */
-    Layer::init(layerMap, parameterMap);
-
-    /* initialize the weightList */
-    size_t i;
-    for (i = 0; i < inputLayers_.size(); i++) {
-      if (!parameters_[i]) break;
-      size_t width = inputLayers_[i]->getSize();
-      // create a new weight
-      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
-      Weight* w = new Weight(numClasses_, width, parameters_[i]);
-
-      // append the new weight to the list
-      weights_.emplace_back(w);
-    }
-
-    CHECK_EQ(1U, getSize());
-
-    numInputs_ = i;
-    CHECK_GE(numInputs_, 1)
-        << "Must have at least one input besides label and weight";
-    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
-    labelLayer_ = inputLayers_[i];
-    if (++i < inputLayers_.size()) {
-      weightLayer_ = inputLayers_[i];
-      ++i;
-    }
-    CHECK_EQ(i, inputLayers_.size());
-
-    /* initialize biases_ */
-    if (biasParameter_.get() != NULL) {
-      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
-      biases_.reset(new Weight(1, numClasses_, biasParameter_));
-    }
-
-    if (config_.neg_sampling_dist_size()) {
-      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(MultinomialSampler::create(
-          config_.neg_sampling_dist().data(), numClasses_));
-    }
-
-    return true;
-  }
-
-  void prepareSamples() {
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    int batchSize = getInput(*labelLayer_).getBatchSize();
-    IVectorPtr label = getInput(*labelLayer_).ids;
-
-    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        getInput(*labelLayer_).value);
-
-    CHECK(label || multiLabel)
-        << "The label layer must have ids or NonValueSparseMatrix value";
-
-    auto& randEngine = ThreadLocalRandomEngine::get();
-
-    samples_.clear();
-    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));
-
-    real* weight =
-        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;
-
-    for (int i = 0; i < batchSize; ++i) {
-      real w = weight ? weight[i] : 1;
-      if (label) {
-        int* ids = label->getData();
-        samples_.push_back({i, ids[i], true, w});
-      } else {
-        const int* cols = multiLabel->getRowCols(i);
-        int n = multiLabel->getColNum(i);
-        for (int j = 0; j < n; ++j) {
-          samples_.push_back({i, cols[j], true, w});
-        }
-      }
-      for (int j = 0; j < config_.num_neg_samples(); ++j) {
-        int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine);
-        samples_.push_back({i, id, false, w});
-      }
-    }
-    prepared_ = true;
-  }
-
-  void prefetch() override {
-    prepareSamples();
-    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
-    int* ids = labelIds_->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      ids[i] = samples_[i].labelId;
-    }
-
-    for (int i = 0; i < numInputs_; ++i) {
-      auto sparseParam =
-          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-      if (sparseParam) {
-        sparseParam->addRows(labelIds_);
-      }
-    }
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    if (!prepared_) {
-      if (passType == PASS_GC) {
-        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
-      }
-      prepareSamples();
-    }
-    prepared_ = false;
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = getInputValue(0)->getHeight();
-    int size = getSize();
-    resetOutput(batchSize, size);
-
-    Matrix::resizeOrCreate(sampleOut_.value,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    forwardBias();
-
-    for (int l = 0; l < numInputs_; ++l) {
-      forwardOneInput(l);
-    }
-
-    auto status = activation_->forward(sampleOut_);
-    status.check();
-
-    forwardCost();
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    Matrix::resizeOrCreate(sampleOut_.grad,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    backwardCost();
-
-    auto status = activation_->backward(sampleOut_);
-    status.check();
-
-    if (biases_->getWGrad()) {
-      backwardBias(callback);
-    }
-
-    for (int l = 0; l < numInputs_; ++l) {
-      backwardOneInput(l, callback);
-    }
-  }
-
-  void forwardBias() {
-    if (!biases_) {
-      sampleOut_.value->zeroMem();
-    } else {
-      real* bias = biases_->getW()->getData();
-      real* sampleOut = sampleOut_.value->getData();
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        sampleOut[i] = bias[samples_[i].labelId];
-      }
-    }
-  }
-
-  void backwardBias(const UpdateCallback& callback) {
-    if (!biases_) return;
-    real* bias = biases_->getWGrad()->getData();
-    real* sampleOut = sampleOut_.grad->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      bias[samples_[i].labelId] += sampleOut[i];
-    }
-    biases_->incUpdate(callback);
-  }
-
-  void forwardOneInput(int layerId) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-
-    int dim = inputMat->getWidth();
-    real* sampleOut = sampleOut_.value->getData();
-
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      sampleOut[i] += dotProduct(dim,
-                                 inputMat->getRowBuf(samples_[i].sampleId),
-                                 weightMat->getRowBuf(samples_[i].labelId));
-    }
-  }
-
-  void backwardOneInput(int layerId, const UpdateCallback& callback) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& inputGradMat = getInputGrad(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();
-
-    int dim = inputMat->getWidth();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    if (weightGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             inputMat->getRowBuf(samples_[i].sampleId),
-             weightGradMat->getRowBuf(samples_[i].labelId));
-      }
-      weights_[layerId]->incUpdate(callback);
-    }
-
-    if (inputGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             weightMat->getRowBuf(samples_[i].labelId),
-             inputGradMat->getRowBuf(samples_[i].sampleId));
-      }
-    }
-  }
-
-  void forwardCost() {
-    real* out = output_.value->getData();
-    real* sampleOut = sampleOut_.value->getData();
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
-      out[samples_[i].sampleId] += samples_[i].weight * cost;
-    }
-  }
-
-  void backwardCost() {
-    real* sampleOut = sampleOut_.value->getData();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real w = samples_[i].weight;
-      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
-    }
-  }
-};
-
-REGISTER_LAYER(nce, NCELayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.cpp b/paddle/legacy/gserver/layers/NormLayer.cpp
deleted file mode 100644
index 443e26dbc..000000000
--- a/paddle/legacy/gserver/layers/NormLayer.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormLayer.h"
-#include "NormProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
-
-Layer* NormLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& norm = config.inputs(0).norm_conf().norm_type();
-  if (norm == "rnorm") {
-    return new ResponseNormLayer(config);
-  } else if (norm == "cmrnorm-projection") {
-    return new CMRProjectionNormLayer(config);
-  } else if (norm == "cross-channel-norm") {
-    return new CrossChannelNormLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown norm type: " << norm;
-    return nullptr;
-  }
-}
-
-bool ResponseNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  NormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  size_ = conf.size();
-  scale_ = conf.scale();
-  pow_ = conf.pow();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  denoms_ = NULL;
-
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.h b/paddle/legacy/gserver/layers/NormLayer.h
deleted file mode 100644
index 5ac00034d..000000000
--- a/paddle/legacy/gserver/layers/NormLayer.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of normalization
- *
- * @note Normalize the input in local region
- */
-class NormLayer : public Layer {
- public:
-  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    Layer::init(layerMap, parameterMap);
-    return true;
-  }
-
-  /**
-   * @brief create norm layer by norm_type
-   */
-  static Layer* create(const LayerConfig& config);
-};
-
-/**
- * @brief response normalization within feature maps
- * namely normalize in independent channel
- * When code refactoring, we delete the original implementation.
- * Need to implement in the futrue.
- */
-class ResponseNormLayer : public NormLayer {
- protected:
-  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
-  real scale_, pow_;
-  MatrixPtr denoms_;
-
- public:
-  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) override {
-    LOG(FATAL) << "Not implemented";
-  }
-};
-
-/**
- * This layer applys normalization across the channels of each sample to a
- * conv layer's output, and scales the output by a group of trainable factors
- * whose dimensions equal to the number of channels.
- * - Input: One and only one input layer are accepted.
- * - Output: The normalized data of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class CrossChannelNormLayer : public NormLayer {
- public:
-  explicit CrossChannelNormLayer(const LayerConfig& config)
-      : NormLayer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-
- protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
deleted file mode 100644
index 72affaa1c..000000000
--- a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-size_t CMRProjectionNormLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-  outputH_ = imgSizeH_;
-  outputW_ = imgSizeW_;
-  layerSize = outputH_ * outputW_ * channels_;
-
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  return layerSize;
-}
-
-bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  ResponseNormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  createFunction(
-      forward_,
-      "CrossMapNormal",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-  createFunction(
-      backward_,
-      "CrossMapNormalGrad",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-
-  return true;
-}
-
-void CMRProjectionNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  /* note: one sample correspond to one row */
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
-
-  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-
-  // prepare forward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
-  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
-
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  // prepare backward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  inputs.addArg(*getOutputValue(), shape_);
-  inputs.addArg(*getOutputGrad(), shape_);
-  inputs.addArg(*denoms_, shape_);
-  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
-
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.h b/paddle/legacy/gserver/layers/NormProjectionLayer.h
deleted file mode 100644
index 492d1fcb7..000000000
--- a/paddle/legacy/gserver/layers/NormProjectionLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "NormLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief response normalization across feature maps
- * namely normalize in number of size_ channels
- */
-class CMRProjectionNormLayer : public ResponseNormLayer {
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-
- public:
-  explicit CMRProjectionNormLayer(const LayerConfig& config)
-      : ResponseNormLayer(config) {}
-
-  ~CMRProjectionNormLayer() {}
-
-  size_t getSize();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  TensorShape shape_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Operator.cpp b/paddle/legacy/gserver/layers/Operator.cpp
deleted file mode 100644
index 5b9cf8d15..000000000
--- a/paddle/legacy/gserver/layers/Operator.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Operator.h"
-
-namespace paddle {
-
-ClassRegistrar<Operator, OperatorConfig, bool> Operator::registrar_;
-
-Operator* Operator::create(const OperatorConfig& config, bool useGpu) {
-  return registrar_.createByType(config.type(), config, useGpu);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Operator.h b/paddle/legacy/gserver/layers/Operator.h
deleted file mode 100644
index 20a248985..000000000
--- a/paddle/legacy/gserver/layers/Operator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-#include "Layer.h"
-#include "paddle/legacy/parameter/Argument.h"
-
-namespace paddle {
-
-// Macro for registering a operator type
-// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
-#define REGISTER_OPERATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    Operator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-/**
- * Operator like Projection, but takes more than one Arguments as input.
- * @note: Operator can't have parameters.
- */
-class Operator {
- public:
-  static Operator* create(const OperatorConfig& config, bool useGpu);
-
-  Operator(const OperatorConfig& config, bool useGpu)
-      : config_(config), useGpu_(useGpu) {}
-
-  virtual ~Operator() {}
-
-  const OperatorConfig& getConfig() const { return config_; }
-
-  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param ins inputs of operator
-   * @param out output of operator
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(std::vector<const Argument*> ins,
-               Argument* out,
-               PassType passType) {
-    ins_ = ins;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward() = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Set layer state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
- protected:
-  /// Config of operator
-  OperatorConfig config_;
-  bool useGpu_;
-
-  /// Store `ins` passed to forward()
-  std::vector<const Argument*> ins_;
-  /// Store `out` passed to forward()
-  Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/OuterProdLayer.cpp b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
deleted file mode 100644
index d0928be9d..000000000
--- a/paddle/legacy/gserver/layers/OuterProdLayer.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the outer product of two vectors
- * @note used in NEURAL TURING MACHINE
- * Input1: vector (batchSize * dim1)
- * Input2: vector (batchSize * dim2)
- * Output: a matrix: (batchSize * (dim1*dim2))
- */
-
-class OuterProdLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-
- public:
-  explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~OuterProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(out_prod, OuterProdLayer);
-
-bool OuterProdLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dim0 = inputLayers_[0]->getSize();
-  size_t dim1 = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(
-      nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_);
-  tmpRow1 = Matrix::create(
-      nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ dim0,
-                           dim1,
-                           /* trans= */ false,
-                           useGpu_);
-  return true;
-}
-
-void OuterProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  CHECK_EQ(dim0 * dim1, getSize());
-  CHECK_EQ(inV1->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dim0 * dim1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str());
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpMtx0->setData(outV->getData() + i * dim0 * dim1);
-      tmpRow0->setData(inV0->getData() + i * dim0);
-      tmpRow1->setData(inV1->getData() + i * dim1);
-
-      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
-    }
-  }
-}
-
-void OuterProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str());
-
-    if (inG0) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inG0->getData() + i * dim0);
-        tmpRow1->setData(inV1->getData() + i * dim1);
-
-        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
-      }
-    }
-
-    if (inG1) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inV0->getData() + i * dim0);
-        tmpRow1->setData(inG1->getData() + i * dim1);
-
-        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.cpp b/paddle/legacy/gserver/layers/PadLayer.cpp
deleted file mode 100644
index 7b92b3de2..000000000
--- a/paddle/legacy/gserver/layers/PadLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pad, PadLayer);
-
-bool PadLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  auto& pad_conf = config_.inputs(0).pad_conf();
-  auto& img_conf = pad_conf.image_conf();
-  CHECK_EQ(config_.inputs_size(), 1);
-  inDims_ = TensorShape(
-      {0,
-       img_conf.channels(),
-       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
-       img_conf.img_size()});
-
-  CHECK_EQ(2, pad_conf.pad_c_size());
-  CHECK_EQ(2, pad_conf.pad_h_size());
-  CHECK_EQ(2, pad_conf.pad_w_size());
-  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
-  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
-  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
-
-  outDims_ = TensorShape(4);
-  setOutDims(0);
-
-  createFunction(forward_,
-                 "Pad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-  createFunction(backward_,
-                 "PadGrad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-
-  return true;
-}
-
-void PadLayer::setOutDims(const size_t batchSize) {
-  outDims_.reshape({batchSize,
-                    inDims_[1] + padc_[0] + padc_[1],
-                    inDims_[2] + padh_[0] + padh_[1],
-                    inDims_[3] + padw_[0] + padw_[1]});
-}
-
-void PadLayer::setTensorDim(const size_t batchSize) {
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  setOutDims(batchSize);
-}
-
-void PadLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  setTensorDim(batchSize);
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("PadForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void PadLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.h b/paddle/legacy/gserver/layers/PadLayer.h
deleted file mode 100644
index 46b8a5959..000000000
--- a/paddle/legacy/gserver/layers/PadLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer pads zeros to inputs according to the specify dimension.
- *         The input and output is a 4D tensor. Padding zeros from the 2nd to
- *         the 4th dimenstion according padc_, padh_ and padw_.
- */
-class PadLayer : public Layer {
- public:
-  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PadLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  void setOutDims(const size_t batchSize);
-  void setTensorDim(const size_t batchSize);
-
-  std::vector<uint32_t> padc_;
-  std::vector<uint32_t> padh_;
-  std::vector<uint32_t> padw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
deleted file mode 100644
index 23715d197..000000000
--- a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterReluLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(prelu, ParameterReluLayer);
-
-bool ParameterReluLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  partialSum_ = config_.partial_sum();
-  CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero.";
-  CHECK(!(inputLayers_[0]->getSize() % partialSum_))
-      << "Incorrect value for partialSum: " << partialSum_
-      << " must divide input size: " << inputLayers_[0]->getSize();
-  CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize());
-  weight_ = std::unique_ptr<Weight>(new Weight(
-      1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0]));
-  return true;
-}
-
-void ParameterReluLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    outV->paramReluForward(*(getInput(0).value), *(weight_->getW()));
-  }
-}
-
-void ParameterReluLayer::backward(const UpdateCallback& callback) {
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(),
-                                            *(getInputValue(0)));
-  }
-
-  MatrixPtr preGrad = getInputGrad(0);
-  preGrad->paramReluBackwardDiff(
-      *getOutputGrad(), *(getInputValue(0)), *(weight_->getW()));
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.h b/paddle/legacy/gserver/layers/ParameterReluLayer.h
deleted file mode 100644
index 3aac4b42f..000000000
--- a/paddle/legacy/gserver/layers/ParameterReluLayer.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- *  @brief ParameterReluLayer active inputs with learnable parameter weight_.
- *  forward:
- *  \f[
- *      y = x > 0 ? x : w .* x
- *  \f]
- *  backward:
- *  \f[
- *      dx = x > 0 ? dy : w .* dy \\
- *      dw = x > 0 ? 0 : dy.*x
- *  \f]
- *  Here, x is the input, w is the weight, y is the output.
- *  dx, dw, dy is the gradient.
- */
-
-class ParameterReluLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> weight_;
-
-  /**
-   *  @brief partialSum_ makes a group of inputs share same weights,
-   *  - partialSum_ = 1:
-   *       element wise activation: each element has a weight_,
-   *  - partialSum_ = number of elements in one channel,
-   *       channels wise parameter activation, elements in a channel
-   *       share same weight_,
-   *  - partialSum_ = number of outputs
-   *       all elements share same weight_,
-   */
-  size_t partialSum_;
-
- public:
-  explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ParameterReluLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.cpp b/paddle/legacy/gserver/layers/Pool3DLayer.cpp
deleted file mode 100644
index ae3f55c27..000000000
--- a/paddle/legacy/gserver/layers/Pool3DLayer.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Pool3DLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pool3d, Pool3DLayer);
-
-bool Pool3DLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-
-  sizeX_ = conf.size_x();
-  sizeY_ = conf.size_y();
-  sizeZ_ = conf.size_z();
-
-  strideW_ = conf.stride();
-  strideH_ = conf.stride_y();
-  strideD_ = conf.stride_z();
-
-  imgSizeW_ = conf.img_size();
-  imgSizeH_ = conf.img_size_y();
-  imgSizeD_ = conf.img_size_z();
-
-  paddingW_ = conf.padding();
-  paddingH_ = conf.padding_y();
-  paddingD_ = conf.padding_z();
-
-  outputW_ = conf.output_x();
-  outputH_ = conf.output_y();
-  outputD_ = conf.output_z();
-
-  return true;
-}
-
-size_t Pool3DLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-
-  size_t layerSize = 0;
-  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
-  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
-  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
-
-  layerSize = outputD_ * outputH_ * outputW_ * channels_;
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  getOutput().setFrameDepth(outputD_);
-  return layerSize;
-}
-
-void Pool3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  size_t batchSize = inMat->getHeight();
-  size_t outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
-  const MatrixPtr outMat = getOutputValue();
-
-  if (poolType_ == "avg") {
-    outMat->avgPool3DForward(*inMat,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else if (poolType_ == "max") {
-    outMat->maxPool3DForward(*inMat,
-                             *maxPoolIdx_,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-  forwardActivation();
-}
-
-void Pool3DLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-
-  (void)callback;
-  if (NULL == getInputGrad(0)) return;
-  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
-  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (poolType_ == "avg") {
-    inGradMat->avgPool3DBackward(*outGradMat,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else if (poolType_ == "max") {
-    inGradMat->maxPool3DBackward(*outGradMat,
-                                 *maxPoolIdx_,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.h b/paddle/legacy/gserver/layers/Pool3DLayer.h
deleted file mode 100644
index 6851c44ab..000000000
--- a/paddle/legacy/gserver/layers/Pool3DLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class Pool3DLayer : public Layer {
- public:
-  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
-  ~Pool3DLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  size_t getSize();
-
- protected:
-  int channels_;
-  int sizeX_, sizeY_, sizeZ_;
-  int strideW_, strideH_, strideD_;
-  int paddingW_, paddingH_, paddingD_;
-  int imgSizeW_, imgSizeH_, imgSizeD_;
-  int outputW_, outputH_, outputD_;
-  std::string poolType_;
-  MatrixPtr maxPoolIdx_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.cpp b/paddle/legacy/gserver/layers/PoolLayer.cpp
deleted file mode 100644
index df172d957..000000000
--- a/paddle/legacy/gserver/layers/PoolLayer.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolLayer.h"
-#include "MaxPoolWithMaskLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnPoolLayer.h"
-#endif
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create);
-
-bool PoolLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-  sizeX_ = conf.size_x();
-  stride_ = conf.stride();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  confPadding_ = conf.padding();
-
-  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
-  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
-  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
-  return true;
-}
-
-Layer* PoolLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& pool = config.inputs(0).pool_conf().pool_type();
-  if (pool == "max-projection" || pool == "avg-projection") {
-    return new PoolProjectionLayer(config);
-#ifdef PADDLE_WITH_CUDA
-  } else if (CudnnPoolLayer::typeCheck(pool)) {
-    return new CudnnPoolLayer(config);
-#endif
-  } else if (pool == "max-pool-with-mask") {
-    return new MaxPoolWithMaskLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << pool;
-    return nullptr;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.h b/paddle/legacy/gserver/layers/PoolLayer.h
deleted file mode 100644
index 0808dfae8..000000000
--- a/paddle/legacy/gserver/layers/PoolLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class PoolLayer : public Layer {
- protected:
-  size_t channels_, sizeX_, stride_, outputX_, imgSize_;
-  int confPadding_;
-
-  size_t sizeY_;
-  size_t imgSizeY_;
-  size_t strideY_;
-  size_t outputY_;
-  int confPaddingY_;
-
-  std::string poolType_;
-
-  bool excludeMode_;
-
- public:
-  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  /**
-   * @brief create pooling layer by pool_type
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjection.cpp b/paddle/legacy/gserver/layers/PoolProjection.cpp
deleted file mode 100644
index 73ce88adf..000000000
--- a/paddle/legacy/gserver/layers/PoolProjection.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION_CREATE_FUNC(pool, &PoolProjection::create);
-
-PoolProjection::PoolProjection(const ProjectionConfig& config,
-                               ParameterPtr parameter,
-                               bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  const PoolConfig& conf = config_.pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-  sizeX_ = conf.size_x();
-  stride_ = conf.stride();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  confPadding_ = conf.padding();
-
-  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
-  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
-  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
-}
-
-size_t PoolProjection::getSize() {
-  imgSizeY_ = in_->getFrameHeight();
-  imgSize_ = in_->getFrameWidth();
-  const PoolConfig& conf = config_.pool_conf();
-  if (imgSizeY_ == 0) {
-    imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  if (imgSize_ == 0) {
-    imgSize_ = conf.img_size();
-  }
-  outputY_ = outputSize(imgSizeY_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputX_ = outputSize(imgSize_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  const_cast<Argument*>(out_)->setFrameHeight(outputY_);
-  const_cast<Argument*>(out_)->setFrameWidth(outputX_);
-
-  return outputY_ * outputX_ * channels_;
-}
-
-PoolProjection* PoolProjection::create(const ProjectionConfig& config,
-                                       ParameterPtr parameter,
-                                       bool useGpu) {
-  const std::string& pool = config.pool_conf().pool_type();
-  if (pool == "max-projection") {
-    return new MaxPoolProjection(config, parameter, useGpu);
-  } else if (pool == "avg-projection") {
-    return new AvgPoolProjection(config, parameter, useGpu);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << pool;
-    return nullptr;
-  }
-}
-
-void MaxPoolProjection::forward() {
-  size_t width = getSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  outV->maxPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_);
-}
-
-void MaxPoolProjection::backward(const UpdateCallback& callback) {
-  (void)callback;
-  MatrixPtr outGrad = out_->grad;
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  MatrixPtr inputGrad = in_->grad;
-
-  if (NULL == inputGrad) {
-    return;
-  }
-  inputGrad->maxPoolBackward(*inputV,
-                             imgSizeY_,
-                             imgSize_,
-                             *outGrad,
-                             *outV,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_);
-}
-
-void AvgPoolProjection::forward() {
-  size_t width = getSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  MatrixPtr inputV = in_->value;
-  MatrixPtr outV = out_->value;
-  outV->avgPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_,
-                       excludeMode_);
-}
-
-void AvgPoolProjection::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = out_->grad;
-  MatrixPtr inputGrad = in_->grad;
-
-  if (NULL == inputGrad) {
-    return;
-  }
-
-  inputGrad->avgPoolBackward(*outputGrad,
-                             imgSizeY_,
-                             imgSize_,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_,
-                             excludeMode_);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjection.h b/paddle/legacy/gserver/layers/PoolProjection.h
deleted file mode 100644
index d01b6a13f..000000000
--- a/paddle/legacy/gserver/layers/PoolProjection.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-namespace paddle {
-
-class PoolProjection : public Projection {
- protected:
-  size_t imgSizeY_, imgSize_;
-  size_t outputY_, outputX_;
-  size_t strideY_, stride_;
-  size_t sizeY_, sizeX_;
-  int confPaddingY_, confPadding_;
-  size_t channels_;
-  std::string poolType_;
-  bool excludeMode_;
-
- public:
-  PoolProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu);
-
-  static PoolProjection* create(const ProjectionConfig& config,
-                                ParameterPtr parameter,
-                                bool useGpu);
-
-  const std::string& getPoolType() const { return poolType_; }
-
-  size_t getSize();
-};
-
-class MaxPoolProjection : public PoolProjection {
- public:
-  MaxPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-
-class AvgPoolProjection : public PoolProjection {
- public:
-  AvgPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
deleted file mode 100644
index e44b1d7ba..000000000
--- a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolProjectionLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-size_t PoolProjectionLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-
-  outputH_ = outputSize(imgSizeH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ = outputSize(imgSizeW_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputH_ * outputW_ * channels_;
-
-  return layerSize;
-}
-
-void PoolProjectionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& in = getInput(0);
-  int batchSize = in.value->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-  poolProjection_->forward(&in, &output_, passType);
-}
-
-void PoolProjectionLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-  poolProjection_->backward(callback);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.h b/paddle/legacy/gserver/layers/PoolProjectionLayer.h
deleted file mode 100644
index fcd35bbba..000000000
--- a/paddle/legacy/gserver/layers/PoolProjectionLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "PoolProjection.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class PoolProjectionLayer : public PoolLayer {
- protected:
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-  std::unique_ptr<PoolProjection> poolProjection_;
-  ProjectionConfig projectionConfig_;
-
- public:
-  explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
-    PoolConfig* conf = projectionConfig_.mutable_pool_conf();
-    *conf = config_.inputs(0).pool_conf();
-    poolProjection_.reset(
-        PoolProjection::create(projectionConfig_, nullptr, useGpu_));
-  }
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PowerLayer.cpp b/paddle/legacy/gserver/layers/PowerLayer.cpp
deleted file mode 100644
index 5e94c64db..000000000
--- a/paddle/legacy/gserver/layers/PowerLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer applys a power function to a vector element-wise,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y = x^w
- * \f]
- * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight,
- * and output \f$y\f$ is a vector.
- *
- * The config file api is power_layer.
- */
-
-class PowerLayer : public Layer {
- protected:
-  MatrixPtr tmpMtx;
-
- public:
-  explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PowerLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(power, PowerLayer);
-
-bool PowerLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void PowerLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(getSize(), dataDim);
-  CHECK_EQ(1U, inV0->getWidth());
-  CHECK_EQ(batchSize, inV0->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str());
-    outV->rowPow(0, *inV1, *inV0);
-  }
-}
-
-void PowerLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str());
-    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
-
-    if (inG0) {
-      tmpMtx->log2(*inV1);
-      tmpMtx->dotMul(*tmpMtx, *outV);
-
-      // inG0 += outG .* (log(inV1) * outV)
-      inG0->rowDotMul(0, *outG, *tmpMtx);
-    }
-
-    if (inG1) {
-      // tmp = (outV / inV1) * inV0
-      tmpMtx->dotDiv(*outV, *inV1);
-      tmpMtx->rowScale(0, *tmpMtx, *inV0);
-
-      inG1->addDotMul(*outG, *tmpMtx, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp
deleted file mode 100644
index 6fbcc447f..000000000
--- a/paddle/legacy/gserver/layers/PrintLayer.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class PrintLayer : public Layer {
- public:
-  explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    std::vector<std::string> vals;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      std::ostringstream s;
-      getInput(i).printValueString(s, "");
-      vals.push_back(s.str());
-    }
-    size_t pos = 0;
-    size_t i = 0;
-    std::ostringstream s;
-    const std::string& format = config_.user_arg();
-    while (true) {
-      size_t pos1 = format.find("%s", pos);
-      if (pos1 == std::string::npos) break;
-      if (i >= vals.size()) {
-        break;
-      }
-      s << format.substr(pos, pos1 - pos) << vals[i];
-      pos = pos1 + 2;
-      ++i;
-    }
-    if (i != inputLayers_.size()) {
-      LOG(ERROR) << "Number of value in the format (" << format
-                 << ") is not same as the number of inputs ("
-                 << inputLayers_.size() << ") at " << getName();
-    }
-    s << format.substr(pos);
-
-    const std::string delimiter("\n");
-    std::string content = s.str();
-    std::string::size_type foundPos = 0;
-    std::string::size_type prevPos = 0;
-    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
-      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
-      prevPos = foundPos + delimiter.size();
-    }
-    LOG(INFO) << content.substr(prevPos);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(print, PrintLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp
deleted file mode 100644
index 83aab6e36..000000000
--- a/paddle/legacy/gserver/layers/PriorBox.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for generating priorbox locations and variances.
- * - Input: Two and only two input layer are accepted. The input layer must be
- *          be a data output layer and a convolution output layer.
- * - Output: The priorbox locations and variances of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class PriorBoxLayer : public Layer {
- public:  // NOLINT
-  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override {}
-
- protected:  // NOLINT
-  int numPriors_;
-  std::vector<int> minSize_;
-  std::vector<int> maxSize_;
-  std::vector<real> aspectRatio_;
-  std::vector<real> variance_;
-  MatrixPtr buffer_;
-};
-
-REGISTER_LAYER(priorbox, PriorBoxLayer);
-
-bool PriorBoxLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto pbConf = config_.inputs(0).priorbox_conf();
-  std::vector<real> tmp;
-  aspectRatio_.push_back(1.);
-  std::copy(pbConf.min_size().begin(),
-            pbConf.min_size().end(),
-            std::back_inserter(minSize_));
-  std::copy(pbConf.max_size().begin(),
-            pbConf.max_size().end(),
-            std::back_inserter(maxSize_));
-  std::copy(pbConf.variance().begin(),
-            pbConf.variance().end(),
-            std::back_inserter(variance_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(tmp));
-
-  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
-
-  // flip aspect ratios
-  for (unsigned index = 0; index < tmp.size(); index++) {
-    real ar = tmp[index];
-    if (fabs(ar - 1.) < 1e-6) continue;
-    aspectRatio_.push_back(ar);
-    aspectRatio_.push_back(1. / ar);
-  }
-
-  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
-
-  return true;
-}
-
-void PriorBoxLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto input = getInput(0);
-  int layerWidth = input.getFrameWidth();
-  int layerHeight = input.getFrameHeight();
-
-  auto image = getInput(1);
-  int imageWidth = image.getFrameWidth();
-  int imageHeight = image.getFrameHeight();
-
-  real stepW = static_cast<real>(imageWidth) / layerWidth;
-  real stepH = static_cast<real>(imageHeight) / layerHeight;
-  int dim = layerHeight * layerWidth * numPriors_ * 4;
-  reserveOutput(1, dim * 2);
-  // use a cpu buffer to compute
-  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
-  auto* tmpPtr = buffer_->getData();
-
-  int idx = 0;
-  for (int h = 0; h < layerHeight; ++h) {
-    for (int w = 0; w < layerWidth; ++w) {
-      real centerX = (w + 0.5) * stepW;
-      real centerY = (h + 0.5) * stepH;
-      for (size_t s = 0; s < minSize_.size(); s++) {
-        real minSize = minSize_[s];
-        real boxWidth = minSize;
-        real boxHeight = minSize;
-
-        // first prior: aspect_ratio == 1.0, compatible to old logic
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-
-        if (maxSize_.size() > 0) {
-          // square prior with size sqrt(minSize * maxSize)
-          real maxSize = maxSize_[s];
-          boxWidth = boxHeight = sqrt(minSize * maxSize);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-
-        // priors with different aspect ratios
-        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-          real ar = aspectRatio_[r];
-          if (fabs(ar - 1.0) < 1e-6) {
-            continue;
-          }
-          boxWidth = minSize * sqrt(ar);
-          boxHeight = minSize / sqrt(ar);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-      }
-    }
-  }
-
-  // clip the prior's coordidate such that it is within [0, 1]
-  for (int d = 0; d < dim * 2; ++d)
-    if ((d % 8) < 4)
-      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(buffer_->data_, dim * 2);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Projection.cpp b/paddle/legacy/gserver/layers/Projection.cpp
deleted file mode 100644
index 96d61e7f6..000000000
--- a/paddle/legacy/gserver/layers/Projection.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-#include "ContextProjection.h"
-#include "FullMatrixProjection.h"
-#include "TableProjection.h"
-
-namespace paddle {
-
-ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-    Projection::registrar_;
-
-Projection* Projection::create(const ProjectionConfig& config,
-                               ParameterPtr parameter,
-                               bool useGpu) {
-  return registrar_.createByType(config.type(), config, parameter, useGpu);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Projection.h b/paddle/legacy/gserver/layers/Projection.h
deleted file mode 100644
index 974f5a2ca..000000000
--- a/paddle/legacy/gserver/layers/Projection.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-// Macro for registering a projection type
-// Example: REGISTER_LAYER(fc, FullMatrixProjection);
-#define REGISTER_PROJECTION(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                 \
-    Projection::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction)    \
-  static InitFunction __reg_type_##__type_name([]() {                   \
-    Projection::registrar_.registerClass(#__type_name, createFunction); \
-  })
-
-/**
- * A projection takes one Argument as input, calculate the result and add it
- * to output Argument.
- */
-class Projection {
- public:
-  static Projection* create(const ProjectionConfig& config,
-                            ParameterPtr parameter,
-                            bool useGpu);
-
-  Projection(const ProjectionConfig& config,
-             ParameterPtr parameter,
-             bool useGpu)
-      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
-
-  virtual ~Projection() {}
-
-  const std::string& getName() const { return config_.name(); }
-
-  /// Register a projection
-  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-      registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param in input of projection
-   * @param out output of projection
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(const Argument* in, const Argument* out, PassType passType) {
-    in_ = in;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward(const UpdateCallback& callback) = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state. A copy of internal state is returned.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * init forward_ and backward_ functions
-   */
-  virtual bool init() { return true; }
-
-  /**
-   * Get output size of projection.
-   */
-  size_t getOutputSize() const { return config_.output_size(); }
-
- protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
- protected:
-  /// Config of projection
-  ProjectionConfig config_;
-  /// Parameter of projection
-  ParameterPtr parameter_;
-  bool useGpu_;
-
-  /// Store `in` passed to forward()
-  const Argument* in_;
-  /// Store `out` passed to forward()
-  const Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ROIPoolLayer.cpp b/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
deleted file mode 100644
index b5cbc0c70..000000000
--- a/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ROIPoolLayer.h"
-#include <cfloat>
-
-namespace paddle {
-
-REGISTER_LAYER(roi_pool, ROIPoolLayer);
-
-bool ROIPoolLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
-  pooledWidth_ = layerConf.pooled_width();
-  pooledHeight_ = layerConf.pooled_height();
-  spatialScale_ = layerConf.spatial_scale();
-
-  return true;
-}
-
-void ROIPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
-  height_ = getInput(0).getFrameHeight();
-  if (!height_) height_ = layerConf.height();
-  width_ = getInput(0).getFrameWidth();
-  if (!width_) width_ = layerConf.width();
-  channels_ = getInputValue(0)->getWidth() / width_ / height_;
-
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t numROIs = getInput(1).getBatchSize();
-
-  MatrixPtr dataValue = getInputValue(0);
-  MatrixPtr roiValue = getInputValue(1);
-  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
-  MatrixPtr outputValue = getOutputValue();
-
-  if (useGpu_) {  // TODO(guosheng): implement on GPU later
-    MatrixPtr dataCpuBuffer;
-    Matrix::resizeOrCreate(dataCpuBuffer,
-                           dataValue->getHeight(),
-                           dataValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr roiCpuBuffer;
-    Matrix::resizeOrCreate(roiCpuBuffer,
-                           roiValue->getHeight(),
-                           roiValue->getWidth(),
-                           false,
-                           false);
-    dataCpuBuffer->copyFrom(*dataValue);
-    roiCpuBuffer->copyFrom(*roiValue);
-    dataValue = dataCpuBuffer;
-    roiValue = roiCpuBuffer;
-    MatrixPtr outputCpuBuffer;
-    Matrix::resizeOrCreate(outputCpuBuffer,
-                           outputValue->getHeight(),
-                           outputValue->getWidth(),
-                           false,
-                           false);
-    outputCpuBuffer->copyFrom(*outputValue);
-    outputValue = outputCpuBuffer;
-  }
-
-  real* bottomData = dataValue->getData();
-  size_t batchOffset = dataValue->getWidth();
-  size_t channelOffset = height_ * width_;
-  real* bottomROIs = roiValue->getData();
-  size_t roiOffset = roiValue->getWidth();
-  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
-
-  real* outputData = outputValue->getData();
-  real* argmaxData = nullptr;
-  if (passType != PASS_TEST) {
-    Matrix::resizeOrCreate(maxIdxs_,
-                           numROIs,
-                           channels_ * pooledHeight_ * pooledWidth_,
-                           false,
-                           false);
-    argmaxData = maxIdxs_->getData();
-  }
-
-  for (size_t n = 0; n < numROIs; ++n) {
-    // the first five elememts of each RoI should be:
-    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
-    size_t roiBatchIdx = bottomROIs[0];
-    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
-    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
-    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
-    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
-    CHECK_GE(roiBatchIdx, 0UL);
-    CHECK_LT(roiBatchIdx, batchSize);
-    size_t roiHeight =
-        std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
-    size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
-    real binSizeH =
-        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
-    real binSizeW =
-        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
-    real* batchData = bottomData + batchOffset * roiBatchIdx;
-    for (size_t c = 0; c < channels_; ++c) {
-      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
-        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
-          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
-          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
-          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
-          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
-          hstart = std::min(
-              std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
-          wstart = std::min(
-              std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
-          hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
-                          height_);
-          wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
-                          width_);
-
-          bool isEmpty = (hend <= hstart) || (wend <= wstart);
-          size_t poolIndex = ph * pooledWidth_ + pw;
-          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
-          if (argmaxData) {
-            argmaxData[poolIndex] = -1;
-          }
-
-          for (size_t h = hstart; h < hend; ++h) {
-            for (size_t w = wstart; w < wend; ++w) {
-              size_t index = h * width_ + w;
-              if (batchData[index] > outputData[poolIndex]) {
-                outputData[poolIndex] = batchData[index];
-                if (argmaxData) {
-                  argmaxData[poolIndex] = index;
-                }
-              }
-            }
-          }
-        }
-      }
-      batchData += channelOffset;
-      outputData += poolChannelOffset;
-      if (argmaxData) {
-        argmaxData += poolChannelOffset;
-      }
-    }
-    bottomROIs += roiOffset;
-  }
-  if (useGpu_) {
-    getOutputValue()->copyFrom(*outputValue);
-  }
-}
-
-void ROIPoolLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inGradValue = getInputGrad(0);
-  MatrixPtr outGradValue = getOutputGrad();
-  MatrixPtr roiValue = getInputValue(1);
-
-  if (useGpu_) {
-    MatrixPtr inGradCpuBuffer;
-    Matrix::resizeOrCreate(inGradCpuBuffer,
-                           inGradValue->getHeight(),
-                           inGradValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr outGradCpuBuffer;
-    Matrix::resizeOrCreate(outGradCpuBuffer,
-                           outGradValue->getHeight(),
-                           outGradValue->getWidth(),
-                           false,
-                           false);
-    MatrixPtr roiCpuBuffer;
-    Matrix::resizeOrCreate(roiCpuBuffer,
-                           roiValue->getHeight(),
-                           roiValue->getWidth(),
-                           false,
-                           false);
-    inGradCpuBuffer->copyFrom(*inGradValue);
-    outGradCpuBuffer->copyFrom(*outGradValue);
-    roiCpuBuffer->copyFrom(*roiValue);
-    inGradValue = inGradCpuBuffer;
-    outGradValue = outGradCpuBuffer;
-    roiValue = roiCpuBuffer;
-  }
-
-  real* bottomROIs = roiValue->getData();
-  size_t numROIs = getInput(1).getBatchSize();
-  size_t roiOffset = getInputValue(1)->getWidth();
-
-  real* inDiffData = inGradValue->getData();
-  size_t batchOffset = getInputValue(0)->getWidth();
-  size_t channelOffset = height_ * width_;
-
-  real* outDiffData = outGradValue->getData();
-  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
-  real* argmaxData = maxIdxs_->getData();
-
-  for (size_t n = 0; n < numROIs; ++n) {
-    size_t roiBatchIdx = bottomROIs[0];
-    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
-    for (size_t c = 0; c < channels_; ++c) {
-      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
-        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
-          size_t poolIndex = ph * pooledWidth_ + pw;
-          if (argmaxData[poolIndex] > 0) {
-            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
-            batchDiffData[index] += outDiffData[poolIndex];
-          }
-        }
-      }
-      batchDiffData += channelOffset;
-      outDiffData += poolChannelOffset;
-      argmaxData += poolChannelOffset;
-    }
-    bottomROIs += roiOffset;
-  }
-
-  if (useGpu_) {
-    getInputGrad(0)->copyFrom(*inGradValue);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ROIPoolLayer.h b/paddle/legacy/gserver/layers/ROIPoolLayer.h
deleted file mode 100644
index 801a9b3ae..000000000
--- a/paddle/legacy/gserver/layers/ROIPoolLayer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
- * feature map.
- * - Input: This layer needs two input layers: The first input layer is a
- *          convolution layer; The second input layer contains the ROI data
- *          which is the output of ProposalLayer in Faster R-CNN. layers for
- *          generating bbox location offset and the classification confidence.
- * - Output: The ROIs' feature map.
- * Reference:
- *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
- *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
- * Networks
- */
-
-class ROIPoolLayer : public Layer {
- protected:
-  size_t channels_;
-  size_t width_;
-  size_t height_;
-  size_t pooledWidth_;
-  size_t pooledHeight_;
-  real spatialScale_;
-
-  // Since there is no int matrix, use real maxtrix instead.
-  MatrixPtr maxIdxs_;
-
- public:
-  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.cpp b/paddle/legacy/gserver/layers/RecurrentLayer.cpp
deleted file mode 100644
index 3fc5bd15e..000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayer.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RecurrentLayer.h"
-
-DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
-
-namespace paddle {
-
-REGISTER_LAYER(recurrent, RecurrentLayer);
-
-bool RecurrentLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize(), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize(), biasParameter_));
-  }
-  reversed_ = config_.reversed();
-  return true;
-}
-
-void RecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-}
-
-void RecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1) << "one matrix is expected for RNN state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr RecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void RecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("RecurrentFwTimer", getName().c_str());
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize(), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  output_.value->assign(*input.value);
-  if (bias_) {
-    output_.value->addBias(*bias_->getW(), 1);
-  }
-  if (!FLAGS_rnn_use_batch) {
-    forwardSequence(batchSize, numSequences, starts);
-  } else {
-    forwardBatch(batchSize, numSequences, starts);
-  }
-}
-
-void RecurrentLayer::forwardSequence(int batchSize,
-                                     size_t numSequences,
-                                     const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str());
-  frameOutput_.reserve(batchSize);
-  for (int i = frameOutput_.size(); i < batchSize; ++i) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               getSize(),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              getSize(),
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * getSize());
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t i = 0; i < numSequences; ++i) {
-    forwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void RecurrentLayer::forwardOneSequence(int start, int length) {
-  if (!reversed_) {
-    if (prevOutput_) {
-      frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-    activation_->forward(frameOutput_[start]).check();
-
-    for (int i = 1; i < length; ++i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*frameOutput_[start + length - 1].value);
-    }
-  } else {
-    activation_->forward(frameOutput_[start + length - 1]).check();
-    for (int i = length - 2; i >= 0; --i) {
-      frameOutput_[start + i].value->mul(
-          *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]).check();
-    }
-  }
-}
-
-void RecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  if (!FLAGS_rnn_use_batch) {
-    backwardSequence(batchSize, numSequences, starts);
-  } else {
-    backwardBatch(batchSize, numSequences, starts);
-  }
-
-  if (input.grad) {
-    input.grad->add(*output_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*output_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void RecurrentLayer::backwardSequence(int batchSize,
-                                      size_t numSequences,
-                                      const int* starts) {
-  REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str());
-  for (int i = 0; i < batchSize; ++i) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize());
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t i = 0; i < numSequences; ++i) {
-    backwardOneSequence(starts[i], starts[i + 1] - starts[i]);
-  }
-}
-
-void RecurrentLayer::backwardOneSequence(int start, int length) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  if (!reversed_) {
-    for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i - 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start + 1, length - 1),
-          1,
-          1);
-    }
-  } else {
-    for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]).check();
-      frameOutput_[start + i + 1].grad->mul(
-          *frameOutput_[start + i].grad, *weightT, 1, 1);
-    }
-    activation_->backward(frameOutput_[start + length - 1]).check();
-    if (weight_->getWGrad()) {
-      weight_->getWGrad()->mul(
-          *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-          *output_.grad->subMatrix(start, length - 1),
-          1,
-          1);
-    }
-  }
-}
-
-void RecurrentLayer::forwardBatch(int batchSize,
-                                  size_t numSequences,
-                                  const int* starts) {
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->copyFromSeq(*output_.value);
-  {
-    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
-    AsyncGpuBlock asyncGpuBlock;
-    /* forward one batch */
-    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
-      MatrixPtr batch2 = batchValue_->getBatchValue(n);
-
-      if (n != 0) {
-        MatrixPtr batch1 =
-            batchValue_->getBatchValue(n - 1, batch2->getHeight());
-        batch2->mul(*batch1, *weight_->getW(), 1, 1);
-      }
-      Argument arg;
-      arg.value = batch2;
-      activation_->forward(arg).check();
-    }
-  }
-  batchValue_->copyBackSeq(*output_.value);
-}
-
-void RecurrentLayer::backwardBatch(int batchSize,
-                                   size_t numSequences,
-                                   const int* starts) {
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  size_t numBatch = batchGrad_->getNumBatch();
-  bool backwardByBatch = numBatch < numSequences;
-
-  batchGrad_->copyFromSeq(*output_.grad);
-  {
-    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
-    MatrixPtr weightT = weight_->getW()->getTranspose();
-    AsyncGpuBlock asyncGpuBlock;
-    /* backward one batch */
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr batch2 = batchGrad_->getBatchValue(n);
-      MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight());
-
-      Argument arg;
-      arg.value = batch1;
-      arg.grad = batch2;
-      activation_->backward(arg).check();
-
-      if (n != 0) {
-        batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
-        batch1->mul(*batch2, *weightT, 1, 1);
-      }
-
-      if (backwardByBatch && weight_->getWGrad()) {
-        if (n != 0) {
-          /* backward weight */
-          batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight());
-          weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1);
-        }
-      }
-    }
-  }
-
-  batchGrad_->copyBackSeq(*output_.grad);
-
-  if (!backwardByBatch && weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t seq = 0; seq < numSequences; ++seq) {
-      int len = starts[seq + 1] - starts[seq];
-      if (!reversed_) {
-        weight_->getWGrad()->mul(
-            *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(),
-            *output_.grad->subMatrix(starts[seq] + 1, len - 1),
-            1,
-            1);
-      } else {
-        weight_->getWGrad()->mul(
-            *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(),
-            *output_.grad->subMatrix(starts[seq], len - 1),
-            1,
-            1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.h b/paddle/legacy/gserver/layers/RecurrentLayer.h
deleted file mode 100644
index 287ea27a0..000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayer.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
- public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
- protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void forwardBatch(int batchSize,
-                            size_t numSequences,
-                            const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void backwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int* starts);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
deleted file mode 100644
index 393212459..000000000
--- a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include "paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * Recurrent layer group is a group of layers, which forward/backward one frame
- * after previous frame forward/backward through all layers in layer group.
- * It's automatically added by config_parser if some layers are defined
- * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
- */
-class RecurrentLayerGroup : public Layer {
- public:
-  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
-
-  void initSubNetwork(NeuralNetwork* rootNetwork,
-                      const ModelConfig& config,
-                      const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu) override;
-
-  void forward(PassType passType) override {
-    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    network_->forward(inArgs, &outArgs, passType);
-  }
-  void backward(const UpdateCallback& callback) override {
-    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
-    network_->backward(nullptr);
-
-    for (auto& para : parameters_) {
-      para->incUpdate(callback);
-    }
-  }
-
-  /**
-   * @see Layer.accessSubNetwork
-   */
-  void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) override {
-    callback(*network_);
-  }
-
- private:
-  std::unique_ptr<RecurrentGradientMachine> network_;
-};
-
-REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
-
-void RecurrentLayerGroup::initSubNetwork(
-    NeuralNetwork* rootNetwork,
-    const ModelConfig& config,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  setNeedGradient(true);
-
-  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
-  ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) {
-    para->enableSharedType(
-        PARAMETER_VALUE,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-  network_->init(config, cb, parameterTypes, useGpu);
-
-  for (auto paramId : network_->getParameterIds()) {
-    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
-    parameter->incShared();
-    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-    parameters_.push_back(parameter);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ResizeLayer.cpp b/paddle/legacy/gserver/layers/ResizeLayer.cpp
deleted file mode 100644
index 8f8aad820..000000000
--- a/paddle/legacy/gserver/layers/ResizeLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for resizing a minibatch matrix h*w to h'*w'
- * @note
- * origin matrix height * width)
- * resize matrix: (height * width / size) * size
- */
-class ResizeLayer : public Layer {
- public:
-  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-};
-
-REGISTER_LAYER(resize, ResizeLayer);
-
-bool ResizeLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ResizeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-  CHECK_EQ((height * width) % getSize(), 0UL);
-
-  reserveOutput(height * width / getSize(), getSize());
-  MatrixPtr tmp =
-      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
-  tmp->assign(*input.value);
-}
-
-void ResizeLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-
-  if (!input.grad) {
-    return;
-  }
-
-  MatrixPtr tmp = Matrix::create(input.grad->getData(),
-                                 height * width / getSize(),
-                                 getSize(),
-                                 false,
-                                 useGpu_);
-  tmp->add(*output_.grad);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RotateLayer.cpp b/paddle/legacy/gserver/layers/RotateLayer.cpp
deleted file mode 100644
index f205d1a91..000000000
--- a/paddle/legacy/gserver/layers/RotateLayer.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RotateLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(rotate, RotateLayer);
-
-bool RotateLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  height_ = config_.height();
-  width_ = config_.width();
-  CHECK_GT(height_, 0);
-  CHECK_GT(width_, 0);
-  return true;
-}
-
-void RotateLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr input = getInputValue(0);
-  batchSize_ = input->getHeight();
-  size_ = input->getWidth();
-  CHECK_GE(size_, height_ * width_);
-  CHECK_EQ(size_ % (height_ * width_), 0)
-      << "total size_ is not dividable by (height_ * width_), i.e., "
-      << "channel number should be an integer";
-  channels_ = size_ / (height_ * width_);
-
-  resizeOutput(batchSize_, size_);
-
-  MatrixPtr outV = getOutputValue();
-  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
-    for (int c = 0; c < channels_; c++) {  // for each feat channel
-      MatrixPtr inputSample =
-          Matrix::create(input->getData() + b * size_ + c * height_ * width_,
-                         height_,
-                         width_,
-                         false,
-                         useGpu_);
-      MatrixPtr outputSample =
-          Matrix::create(outV->getData() + b * size_ + c * height_ * width_,
-                         width_,
-                         height_,
-                         false,
-                         useGpu_);
-      inputSample->rotate(outputSample, false, true /* clock-wise */);
-    }
-  }
-
-  if (getInputGrad(0)) {
-    zeroGrad();
-  }
-}
-
-void RotateLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = getOutputGrad();
-  if (outputGrad == NULL) {
-    return;
-  }
-  // the grad should be rotated in the reverse direction
-  MatrixPtr preGrad = getInputGrad(0);
-
-  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
-    for (int c = 0; c < channels_; c++) {  // for each feat channel
-      MatrixPtr inputSampleGrad =
-          Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_,
-                         height_,
-                         width_,
-                         false,
-                         useGpu_);
-      MatrixPtr outputSampleGrad = Matrix::create(
-          outputGrad->getData() + b * size_ + c * height_ * width_,
-          width_,
-          height_,
-          false,
-          useGpu_);
-      MatrixPtr tmpGrad = nullptr;
-      outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */);
-      inputSampleGrad->add(*tmpGrad);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RotateLayer.h b/paddle/legacy/gserver/layers/RotateLayer.h
deleted file mode 100644
index 498e24372..000000000
--- a/paddle/legacy/gserver/layers/RotateLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
- * domain
- * The rotation is 90 degrees in clock-wise for each channel
- * \f[
- *   y(j,i,:) = x(M-i-1,j,:)
- * \f]
- * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
- *
- * The config file api is rotate_layer
- *
- */
-
-class RotateLayer : public Layer {
- public:
-  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
-
- private:
-  int batchSize_;
-  int size_;
-  int height_;
-  int width_;
-  int channels_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.cpp b/paddle/legacy/gserver/layers/RowConvLayer.cpp
deleted file mode 100644
index 1961557dc..000000000
--- a/paddle/legacy/gserver/layers/RowConvLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(row_conv, RowConvLayer);
-
-bool RowConvLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
-  createFunction(forward_, "RowConv", FuncConfig());
-  createFunction(backward_, "RowConvGrad", FuncConfig());
-
-  return true;
-}
-
-void RowConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = getInputValue(0);
-  size_t height = input->getHeight();
-  size_t width = input->getWidth();
-  CHECK_EQ(width, getSize());
-  resetOutput(height, width);
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-  MatrixPtr w = weight_->getW();
-  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
-
-  MatrixPtr outV = getOutputValue();
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*w, wDims_);
-  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
-    forward_[0]->calc(inputs, outputs);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void RowConvLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), *startPos);
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*weight_->getW(), wDims_);
-
-  MatrixPtr inGrad = getInputGrad(0);
-  MatrixPtr wGrad = weight_->getWGrad();
-  size_t h = getInputValue(0)->getHeight();
-  size_t w = getInputValue(0)->getWidth();
-  outputs.addArg(
-      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
-      *startPos,
-      ADD_TO);
-  outputs.addArg(
-      wGrad ? (*wGrad)
-            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
-      wDims_,
-      ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
-    backward_[0]->calc(inputs, outputs);
-  }
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.h b/paddle/legacy/gserver/layers/RowConvLayer.h
deleted file mode 100644
index 3b74df0b1..000000000
--- a/paddle/legacy/gserver/layers/RowConvLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief Row Convolution Layer.
- */
-class RowConvLayer : public Layer {
- public:
-  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~RowConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- protected:
-  // Row convolution weight, context_lenght_ * fan_out.
-  // fan_out is the size of output feature.
-  std::unique_ptr<Weight> weight_;
-
-  // The step number to look ahead plus one equals contexLength_.
-  size_t contexLength_;
-  TensorShape wDims_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowL2NormLayer.cpp b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
deleted file mode 100644
index d5e6e10a0..000000000
--- a/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for L2 normalization in each row,
- * \f[
- *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
- * \f]
- * where the size of \f$in\f$ is (batchSize x dataDim),
- * and the size of \f$out\f$ is (batchSize x dataDim).
- */
-
-class RowL2NormLayer : public Layer {
- protected:
-  MatrixPtr inSquare_;
-  MatrixPtr l2NormReciprocal_;
-  MatrixPtr dotSum_;
-
- public:
-  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
-
-bool RowL2NormLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void RowL2NormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-  CHECK_EQ(dataDim, inV->getWidth());
-  resetOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
-  inV->square2(*inSquare_);
-  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
-  inSquare_->rowSum(*l2NormReciprocal_);
-  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
-  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
-  outV->rowScale(0, *inV, *l2NormReciprocal_);
-}
-
-void RowL2NormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-  size_t batchSize = inV->getHeight();
-
-  // inG[ij] += outG[ij] / l2NormReciprocal
-  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
-  // inV[i])
-  if (inG) {
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    inSquare_->rowScale(0, *inV, *dotSum_);
-    inG->sub(*inSquare_);
-    inG->addRowScale(0, *outG, *l2NormReciprocal_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SamplingIdLayer.cpp b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
deleted file mode 100644
index dbce63588..000000000
--- a/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <random>
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for sampling id from multinomial distribution from the
- * input layer. Sampling one id for one sample. The result is stored in
- * output_.ids.
- *
- * The config file api is sampling_id_layer.
- */
-class SamplingIdLayer : public Layer {
-  /// Produces random floating-point values, uniformly distributed on [0, 1).
-  std::uniform_real_distribution<double> rand1_;
-  std::vector<Argument> tmpCpuInput_;
-
- public:
-  explicit SamplingIdLayer(const LayerConfig& config)
-      : Layer(config), rand1_(0, 1) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    if (useGpu_) {
-      tmpCpuInput_.reserve(inputLayers_.size());
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_.push_back(Argument());
-      }
-    }
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    if (useGpu_) {
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(
-            getInput(i), false, HPPL_STREAM_DEFAULT);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      forwardImp(tmpCpuInput_[0]);
-    } else {
-      forwardImp(getInput(0));
-    }
-  }
-
-  void forwardImp(const Argument& input) {
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
-    real* buf = input.value->getData();
-    int dim = input.value->getWidth();
-    std::vector<int> ids(batchSize);
-    auto& reng = ThreadLocalRandomEngine::get();
-    for (size_t i = 0; i < batchSize; ++i) {
-      double r = rand1_(reng);
-      int id = dim - 1;
-      for (int j = 0; j < dim; ++j) {
-        if ((r -= buf[i * dim + j]) < 0) {
-          id = j;
-          break;
-        }
-      }
-      ids[i] = id;
-    }
-    output_.ids->copyFrom(ids.data(), batchSize);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(sampling_id, SamplingIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
deleted file mode 100644
index 8af78a2e2..000000000
--- a/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer applies a linear transformation to each element in each row of
- * the input matrix. For each element, the layer first re-scale it and then
- * adds a bias to it.
- *
- * \f[
- *    y = wx + b
- * \f]
- *
- * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
- *
- */
-
-class ScaleShiftLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> scale_;
-  std::unique_ptr<Weight> offset_;
-
- public:
-  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scale_shift, ScaleShiftLayer);
-
-bool ScaleShiftLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1U);
-  scale_.reset(new Weight(1, 1, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
-  }
-  return true;
-}
-
-void ScaleShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  real scaleValue = scale_->getW()->getElement(0, 0);
-  outV->mulScalar(*inV, scaleValue);
-  if (offset_) {
-    real offsetValue = offset_->getW()->getElement(0, 0);
-    outV->add(offsetValue);
-  }
-}
-
-void ScaleShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  /* Calculate the parameter gradient for the current layer */
-  if (scale_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-    rowSumMtx->sumOfProducts(
-        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-    scale_->getWGrad()->sumCols(
-        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
-    scale_->getParameterPtr()->incUpdate(callback);
-  }
-  if (offset_ && offset_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    rowSumMtx->sumRows(*outG, 1., 0.);
-    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
-    offset_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers error */
-  if (inG) {
-    real scaleValue = scale_->getW()->getElement(0, 0);
-    inG->add(*outG, scaleValue);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
deleted file mode 100644
index 70d44d2a7..000000000
--- a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
-
-bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
-  auto& conf = config_.inputs(0).scale_sub_region_conf();
-  value_ = conf.value();
-
-  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
-  createFunction(
-      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
-
-  return true;
-}
-
-void ScaleSubRegionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto in0 = getInput(0);
-  imgH_ = in0.getFrameHeight();
-  imgW_ = in0.getFrameWidth();
-  if (imgH_ == 0 || imgW_ == 0) {
-    auto& conf = config_.inputs(0).scale_sub_region_conf();
-    imgH_ = conf.image_conf().img_size_y();
-    imgW_ = conf.image_conf().img_size();
-  }
-  MatrixPtr imgV = in0.value;
-  size_t batchSize = imgV->getHeight();
-  size_t spatialSize = imgH_ * imgW_;
-  channelsNum_ = imgV->getWidth() / spatialSize;
-  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
-
-  resetOutput(batchSize, imgV->getWidth());
-  auto& out = getOutput();
-  out.setFrameHeight(imgH_);
-  out.setFrameWidth(imgW_);
-
-  MatrixPtr indicesV = getInputValue(1);
-  indicesShape_ = TensorShape({batchSize, 6});
-
-  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*imgV, shape_);
-  inArgs.addArg(*indicesV, indicesShape_);
-  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
-  forward_[0]->calc(inArgs, outArgs);
-}
-
-void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*getOutputGrad(), shape_);
-  inArgs.addArg(*getInputValue(1), indicesShape_);
-  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
-  backward_[0]->calc(inArgs, outArgs);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
deleted file mode 100644
index fe431698b..000000000
--- a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  For each instance, this layer can be used to multiply a value to a
- *         specified sub continuous region. By providing start index and end
- *         index for C/H/W, you can specify the location and shape of the
- *         region.
- *
- *         input_0: Input value.
- *         input_1: Indices value to specify the location an shape of the
- *                  region.
- */
-class ScaleSubRegionLayer : public Layer {
- public:
-  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScaleSubRegionLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
- protected:
-  TensorShape shape_;
-  TensorShape indicesShape_;
-  size_t imgH_;
-  size_t imgW_;
-  size_t channelsNum_;
-  real value_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingLayer.cpp b/paddle/legacy/gserver/layers/ScalingLayer.cpp
deleted file mode 100644
index a8286b661..000000000
--- a/paddle/legacy/gserver/layers/ScalingLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for each row of a matrix, multiplying with a element of a vector,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x.row[i]
- * \f]
- * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is
- * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is scaling_layer.
- */
-
-class ScalingLayer : public Layer {
- public:
-  explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScalingLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scaling, ScalingLayer);
-
-bool ScalingLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ScalingLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(weightV->getWidth(), 1U);
-  CHECK_EQ(weightV->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str());
-    // outV += inV1 * weight
-    outV->addRowScale(0, *inV1, *weightV);
-  }
-}
-
-void ScalingLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str());
-
-    if (inG0) {
-      // inG0 += outG .* inV1
-      inG0->rowDotMul(0, *outG, *inV1);
-    }
-
-    if (inG1) {
-      // inG1 += outG * weight;
-      inG1->addRowScale(0, *outG, *weightV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingProjection.cpp b/paddle/legacy/gserver/layers/ScalingProjection.cpp
deleted file mode 100644
index 4d871cafc..000000000
--- a/paddle/legacy/gserver/layers/ScalingProjection.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-class ScalingProjection : public Projection {
- public:
-  ScalingProjection(const ProjectionConfig& config,
-                    const ParameterPtr& parameter,
-                    bool useGpu)
-      : Projection(config, parameter, useGpu) {
-    CHECK_EQ(parameter->getSize(), 1UL);
-    weight_.reset(new Weight(1, 1, parameter));
-  }
-
-  void forward() {
-    CHECK(in_->value);
-    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
-  }
-
-  void backward(const UpdateCallback& callback) {
-    if (weight_->getWGrad()) {
-      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
-      sum->sumOfProducts(*in_->value,
-                         *out_->grad,
-                         /* scaleSum= */ 1,
-                         /* scaleDest= */ 0);
-      weight_->getWGrad()->sumCols(*sum,
-                                   /* scaleSum= */ 1,
-                                   /* scaleDest= */ 1);
-      parameter_->incUpdate(callback);
-    }
-    if (in_->grad) {
-      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
-    }
-  }
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(scaling, ScalingProjection);
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
deleted file mode 100644
index 72fb06814..000000000
--- a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SelectiveFullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer);
-
-bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  inputNum_ = inputLayers_.size();
-  if (config_.has_selected_colums()) {
-    inputNum_ -= 1;
-  }
-  for (size_t i = 0; i < inputNum_; i++) {
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-    // NOTE weight is transpoed
-    weights_.emplace_back(new Weight(width, height, parameters_[i]));
-  }
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  fullOutput_ = false;
-
-  return true;
-}
-
-void SelectiveFullyConnectedLayer::prefetch() {}
-
-void SelectiveFullyConnectedLayer::reserveOutput(size_t height,
-                                                 size_t width,
-                                                 size_t nnz) {
-  bool flag = (passType_ == PASS_TEST &&
-               config_.selective_fc_pass_generation() && !fullOutput_);
-  SetDevice device(output_.deviceId);
-  if (flag) {
-    // output_.value is sparse matrix
-    if (dynamic_cast<CpuMatrix*>(output_.value.get()) ||
-        dynamic_cast<GpuMatrix*>(output_.value.get())) {
-      output_.value = nullptr;
-    }
-    Matrix::resizeOrCreateSparseMatrix(output_.value,
-                                       height,
-                                       width,
-                                       nnz,
-                                       FLOAT_VALUE,
-                                       SPARSE_CSR,
-                                       /*trans=*/false,
-                                       /*useGpu=*/useGpu_);
-    output_.value->copyFrom(*selCols_);
-    interOutput_ = output_.value;
-  } else {
-    if (fullOutput_) {
-      // output_.value is dense matrix
-      if (dynamic_cast<CpuSparseMatrix*>(output_.value.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(output_.value.get())) {
-        output_.value = nullptr;
-      }
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             width,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = output_.value;
-    } else {
-      // output_.value is dense matrix, but width = nnz /height
-      CHECK_EQ(nnz % height, 0U);
-      CHECK(nnz / height);
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             nnz / height,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = Matrix::createSparseMatrix(output_.value->getData(),
-                                                selCols_->getRows(),
-                                                selCols_->getCols(),
-                                                height,
-                                                width,
-                                                nnz,
-                                                FLOAT_VALUE,
-                                                SPARSE_CSR,
-                                                /*trans=*/false,
-                                                /*useGpu=*/useGpu_);
-    }
-  }
-  interOutput_->zeroMem();
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a "
-                                  "same number of selected columns.";
-    CHECK(nnz / height)
-        << "during training, "
-           "each sample must have at least one column selected.";
-    Matrix::resizeOrCreate(output_.grad,
-                           height,
-                           nnz / height,
-                           /*trans=*/false,
-                           /*useGpu=*/useGpu_);
-    output_.grad->zeroMem();
-  }
-}
-
-void SelectiveFullyConnectedLayer::forward(PassType passType) {
-  REGISTER_TIMER("selective_fc.forward");
-  Layer::forward(passType);
-
-  getSelectiveCols();
-  size_t height = getInput(0).getBatchSize();
-  size_t width = getSize();
-  size_t nnz = height * width;
-  if (!fullOutput_) {
-    CHECK(selCols_);
-    CHECK(height == selCols_->getHeight());
-    CHECK(width == selCols_->getWidth());
-    nnz = selCols_->getElementCnt();
-  }
-
-  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
-  // this outV should be used as input of MaxIdLayer and softmax activation
-  reserveOutput(height, width, nnz);
-
-  bool flag = true;
-  for (size_t i = 0; i < inputNum_; i++) {
-    MatrixPtr input = getInputValue(i);
-    MatrixPtr weight = weights_[i]->getW();
-    size_t hsize = input->getHeight();
-    size_t wsize = weight->getHeight();
-    real scaleT = i == 0 ? real(0) : real(1);
-
-    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
-           !fullOutput_;
-    if (flag) {
-      // if the indecies are highly sparse,
-      // manully compute the multiplication of
-      // the input vector and the selected rows.
-      REGISTER_TIMER("selective.plain");
-      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-    } else {
-      // if the indecies is not sparse enough,
-      // use full mul instead
-      REGISTER_TIMER("selective.mul");
-      if (fullOutput_) {
-        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-      } else {
-        Matrix::resizeOrCreate(mmat_,
-                               hsize,
-                               wsize,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-        mmat_->mul(*input, *weight->getTranspose());
-        interOutput_->add3(mmat_);
-      }
-    }
-  }
-
-  if (biases_) {
-    interOutput_->addBias(*(biases_->getW()), 1);
-  }
-
-  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
-          !fullOutput_);
-  if (flag) {
-    // during generation, output of this layer is a sparse csr matrix,
-    // which is probably the input of maxid layer
-    // if the model is trained with multi-class-cross-entroy-with-selfnorm,
-    // activiation of this layer should be exponential, not softmax.
-
-    Argument arg;
-    arg.value = Matrix::create(interOutput_->getData(),
-                               1,
-                               nnz,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
-    activation_->forward(arg).check();
-  } else /* train and test in train, not generating */ {
-    // during training, this layer output value is *Matrix*, which is input of
-    // eg. multi-class-cross-entropy
-
-    // while training, every sample has a equal number of selected
-    // columns to be activated.
-    // note indices of multi-class-cross-entropy need to be remapped
-    // to this index.
-    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
-
-    forwardActivation();
-  }
-}
-
-void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-  MatrixPtr oGrad = getOutputGrad();
-  if (!fullOutput_) {
-    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
-                                               interOutput_->getRows(),
-                                               interOutput_->getCols(),
-                                               interOutput_->getHeight(),
-                                               interOutput_->getWidth(),
-                                               interOutput_->getElementCnt(),
-                                               FLOAT_VALUE,
-                                               SPARSE_CSR,
-                                               /*trans=*/false,
-                                               /*useGpu=*/useGpu_);
-  } else {
-    interOutGrad_ = Matrix::create(oGrad->getData(),
-                                   oGrad->getHeight(),
-                                   oGrad->getWidth(),
-                                   /*trans=*/false,
-                                   /*useGpu=*/useGpu_);
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // backward is different from FullyConnectedLayer
-  // because the weight is transposed
-  for (size_t i = 0; i < inputNum_; i++) {
-    AsyncGpuBlock block;
-    MatrixPtr preGrad = getInputGrad(i);
-    if (preGrad) {
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
-    }
-
-    MatrixPtr wGrad = weights_[i]->getWGrad();
-    if (wGrad) {
-      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-      MatrixPtr input = getInputValue(i);
-      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
-  if (candidates == nullptr) {
-    fillFullySelectiveData();
-    return;
-  }
-
-  size_t sampleNum = candidates->size();
-  size_t outputWidth = getSize();
-  size_t nnz =
-      std::accumulate(candidates->begin(),
-                      candidates->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-
-  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
-                                     sampleNum,
-                                     outputWidth,
-                                     nnz,
-                                     NO_VALUE,
-                                     SPARSE_CSR,
-                                     false,
-                                     false);
-  CHECK(this->cpuSelCols_ != nullptr);
-  CpuSparseMatrixPtr selCols =
-      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
-  int* rowOffsets = selCols->getRows();
-  int* colIndices = selCols->getCols();
-
-  rowOffsets[0] = 0;
-  int idx = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    if ((*candidates)[i].second > 0) {
-      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
-      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
-        colIndices[idx] = (*candidates)[i].first[j];
-        idx++;
-      }
-    } else {
-      rowOffsets[i + 1] = rowOffsets[i];
-    }
-  }
-
-  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
-  if (!useGpu_) {
-    this->selCols_ = this->cpuSelCols_;
-  } else {
-    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
-                                       sampleNum,
-                                       outputWidth,
-                                       nnz,
-                                       NO_VALUE,
-                                       SPARSE_CSR,
-                                       false,
-                                       true);
-    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-  }
-
-  fullOutput_ = false;
-}
-
-void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
-  if (config_.has_selected_colums()) {
-    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
-    fullOutput_ = false;
-  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
-    this->fillFullySelectiveData();
-  }  // else selCols_ is initialized by fillSelectiveData
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
deleted file mode 100644
index 3ba04d9b2..000000000
--- a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief The SelectiveFullyConnectedLayer class
- *
- * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it
- * requires an additional input to indicate several selected columns, and only
- * compute the multiplications between the input matrices and the selected
- * columns of the parameter matrices of this layer. If the selected columns is
- * not specified, SelectiveFullyConnected layer acts exactly like
- * FullyConnectedLayer.
- *
- * The config file api is selective_fc_layer.
- */
-class SelectiveFullyConnectedLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- private:
-  /**
-   * Get selected columns each forward.
-   */
-  void getSelectiveCols();
-
-  MatrixPtr mmat_;
-  /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns.
-  MatrixPtr cpuSelCols_;
-  /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points
-  /// to cpuSelCols_.
-  MatrixPtr selCols_;
-  size_t inputNum_;
-
-  /// interOutput_ shared same memory with output_.value.
-  MatrixPtr interOutput_;
-
-  /// if fullOutput_ is false, interOutGrad_ sparse matrix
-  MatrixPtr interOutGrad_;
-
-  /// if true, means output_.value is the same as Fc Layer
-  bool fullOutput_;
-
- public:
-  explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
-      : Layer(config), selCols_(nullptr) {}
-
-  ~SelectiveFullyConnectedLayer() {}
-  void prefetch() override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  /**
-   * @brief Resize the output matrix size.
-   * And reset value to zero
-   */
-  void reserveOutput(size_t height, size_t width, size_t nnz);
-
-  /**
-   * @brief Fill candidates to select several activations as output.
-   * @param candidates specifies several selected columns of the parameter
-   * matrices of this layer.
-   * Multiplications only between the input matrices and the selected columns
-   * are computed.
-   * If the candidates is a nullptr, selective fc layer acts exactly like the
-   * fully connected layer.
-   * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH
-   */
-  void fillSelectiveData(
-      const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /**
-   * @brief Make SelectiveFC act as FullyConnectedLayer
-   */
-  void fillFullySelectiveData() { fullOutput_ = true; }
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
deleted file mode 100644
index 7b598e11a..000000000
--- a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for concatenating the first sequence with the second sequence
- * Input: two sequences each containing the same number of instances
- *        seq1 = [a1, a2, ..., an]
- *        seq2 = [b1, b2, ..., bn]
- * Output: a concatenated sequence of the two input sequences
- *        out = [a1, b1, a2, b2, ..., an, bn]
- */
-
-class SequenceConcatLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SequenceConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqconcat, SequenceConcatLayer);
-
-bool SequenceConcatLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(2U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceConcatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input1 = getInput(0);
-  size_t numSequences1 = input1.getNumSequences();
-  auto startPositions1 = input1.sequenceStartPositions->getVector(false);
-
-  const Argument& input2 = getInput(1);
-  size_t numSequences2 = input2.getNumSequences();
-  auto startPositions2 = input2.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input1.value->getWidth());
-  CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(dim, input2.value->getWidth());
-  CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  MatrixPtr inputValue1 = getInputValue(0);
-  MatrixPtr inputValue2 = getInputValue(1);
-
-  // reset output
-  reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      outputValue->subMatrix(offset, leftNumIns)
-          ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns)));
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      outputValue->subMatrix(offset, rightNumIns)
-          ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns)));
-      offset += rightNumIns;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) {
-      tgtBuf[seqId] = starts1[seqId] + starts2[seqId];
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceConcatLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr inputGrad2 = getInputGrad(1);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false);
-
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  size_t numSequences2 = startPositions2->getSize() - 1;
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      if (inputGrad1) {
-        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
-      }
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      if (inputGrad2) {
-        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
-      }
-      offset += rightNumIns;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
deleted file mode 100644
index 8735d71ba..000000000
--- a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for extracting the last instance of the input sequence.
- * Input: a sequence
- * If SequenceLevel = kNonseq:
- *   Output: a sequence containing only the last instance of the input sequence
- *   If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and getting last instance
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *   Check input sequence must has sub-sequence
- *   Output: a sequence containing only the last instance of each sub-sequence
- *           of the input sequence
- *
- * The config file api is last_seq and first_seq.
- */
-
-class SequenceLastInstanceLayer : public SequencePoolLayer {
- protected:
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-  std::vector<int> instanceIds_;
-
- public:
-  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
-
-bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-  reversed_ = config_.select_first();
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  return true;
-}
-
-void SequenceLastInstanceLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  auto starts = startPositions_->getData(false);
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-
-    instanceIds_.clear();
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
-      instanceIds_.push_back(insId);
-
-      outputValue->subMatrix(seqId, 1, tmpDest_)
-          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-
-  /*  activation, should set to 'linear' in most cases */
-  forwardActivation();
-}
-
-void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputGrad) {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
-
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
-          ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
deleted file mode 100644
index 243b795db..000000000
--- a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SequencePoolLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-bool SequencePoolLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // seqlastins/max/average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  stride_ = config_.seq_pool_stride();
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequencePoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "Input should be a sequence or subsequence for layer " << getName();
-
-  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  size_t dim = getSize();
-  // check
-  CHECK_EQ(dim, input.value->getWidth());
-  startPositions_ =
-      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
-  auto starts = startPositions_->getVector(false);
-  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
-  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
-
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-   */
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-        << "when trans_type = seq, input must hasSubseq";
-    output_.degradeSequence(input);
-  }
-  if (stride_ > 0) {
-    CHECK_EQ(input.hasSubseq(), 0UL)
-        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
-    newBatchSize_ = startPositions_->getSize() - 1;
-  }
-
-  resetOutput(newBatchSize_, dim);
-}
-
-void SequencePoolLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.h b/paddle/legacy/gserver/layers/SequencePoolLayer.h
deleted file mode 100644
index 1c019b313..000000000
--- a/paddle/legacy/gserver/layers/SequencePoolLayer.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
- *
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sequence}{input[i]}
- *    If stride_ > 0:
- *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence. Stride is the step size by which we slide
- *                a window upon the input sequence, and the pooling operation
- *                is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class SequencePoolLayer : public Layer {
- protected:
-  int type_;
-  std::unique_ptr<Weight> biases_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  size_t newBatchSize_;
-  ICpuGpuVectorPtr startPositions_;
-  int stride_;
-  // Whether the input sequence is reversed or not.
-  bool reversed_ = false;
-
- public:
-  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
deleted file mode 100644
index e3d40cab5..000000000
--- a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *  A layer for reshaping the sequence. Assume the input sequence has
- *  T instances, the dimension of each instance is M, and the input
- *  reshape_dim is N, then the output sequence has T*M/N instances,
- *  the dimension of each instance is N.
- *
- *  Note that T*M/N must be an integer.
- */
-
-class SequenceReshapeLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-
-  MatrixPtr reshapedOutputGrad;
-
- public:
-  explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
-
-bool SequenceReshapeLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceReshapeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-
-  size_t inDim = input.value->getWidth();
-  size_t outDim = getSize();
-
-  size_t numSequences = input.getNumSequences();
-
-  // by default, we assume each instance as a sequence
-  IVectorPtr seqStarts;
-  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
-  int* startsData = seqStarts->getData();
-  for (int i = 0; i < input.getBatchSize() + 1; i++) {
-    startsData[i] = i;
-  }
-  const int* starts = startsData;
-
-  // if there is sequence, then use start positions
-  if (input.sequenceStartPositions) {
-    auto startPositions = input.sequenceStartPositions->getVector(false);
-    starts = startPositions->getData();
-    CHECK_EQ(starts[numSequences], input.getBatchSize());
-    CHECK_EQ(numSequences, startPositions->getSize() - 1);
-  }
-
-  for (size_t seqID = 0; seqID < numSequences; seqID++) {
-    size_t inNumIns = starts[seqID + 1] - starts[seqID];
-    size_t outNumIns = inNumIns * inDim / outDim;
-    CHECK_EQ(outNumIns * outDim, inNumIns * inDim);
-  }
-
-  MatrixPtr inputValue = getInputValue(0);
-
-  // reset output
-  reserveOutput(inputValue->getHeight() * inDim / outDim, outDim);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str());
-
-    outputValue->copyFrom(*inputValue);
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) {
-      tgtBuf[seqId] = starts[seqId] * inDim / outDim;
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceReshapeLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str());
-
-  if (inputGrad) {
-    Matrix::resizeOrCreate(reshapedOutputGrad,
-                           inputGrad->getHeight(),
-                           inputGrad->getWidth(),
-                           false,
-                           useGpu_);
-    reshapedOutputGrad->copyFrom(*outputGrad);
-    inputGrad->add(*reshapedOutputGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
deleted file mode 100644
index 3ed51c4ef..000000000
--- a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-class SequenceSliceLayer : public Layer {
- public:
-  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second and the (optional) third input which are some
-   * selected indices of the give sequence to trim the sequence, are actually
-   * filled with int types so that storing int types information in real number
-   * matrices is very dangerous, since real numbers will be convered to int
-   * types. If a user fills this matrix himself, invalid data may occor.
-   */
-
-  MatrixPtr startIdsOnCpu_;
-  MatrixPtr endIdsOnCpu_;
-
-  std::vector<int> selectedRows_;
-  IVectorPtr rowIndice_;
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-  std::vector<int> outSubSeqStartPos_;
-  std::vector<int> outSeqStartPos_;
-
-  void checkInputs();
-  void copySliceIdsToCpu();
-  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
-};
-
-REGISTER_LAYER(seq_slice, SequenceSliceLayer);
-
-bool SequenceSliceLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_GE(inputLayers_.size(), 2U);
-  CHECK_LE(inputLayers_.size(), 3U);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceSliceLayer::checkInputs() {
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
-                           << "must be a sequence.";
-  const MatrixPtr indices1 = getInputValue(1);
-  CHECK_EQ(
-      indices1->getHeight(),
-      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
-                                               : inputSeq.getNumSequences()))
-      << "Height of the second input should be equal to number of sequence "
-      << "in the first input.";
-  if (inputLayers_.size() == 3) {
-    const MatrixPtr indices2 = getInputValue(2);
-    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
-        << "start indices and end indices should have the same height.";
-    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
-        << "start indices and end indices should have the same Width.";
-  }
-}
-
-void SequenceSliceLayer::copySliceIdsToCpu() {
-  const MatrixPtr indices1 = getInputValue(1);
-  if (inputLayers_.size() == 2U) {
-    if (config_.select_first()) {
-      Matrix::resizeOrCreate(startIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      startIdsOnCpu_->copyFrom(*indices1);
-      endIdsOnCpu_ = nullptr;
-    } else {
-      Matrix::resizeOrCreate(endIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      endIdsOnCpu_->copyFrom(*indices1);
-      startIdsOnCpu_ = nullptr;
-    }
-  } else if (inputLayers_.size() == 3U) {
-    Matrix::resizeOrCreate(startIdsOnCpu_,
-                           indices1->getHeight(),
-                           indices1->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    startIdsOnCpu_->copyFrom(*indices1);
-
-    const MatrixPtr indices2 = getInputValue(2);
-    Matrix::resizeOrCreate(endIdsOnCpu_,
-                           indices2->getHeight(),
-                           indices2->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    endIdsOnCpu_->copyFrom(*indices2);
-  }
-}
-
-void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
-                                         const MatrixPtr ends) {
-  CHECK(starts || ends) << "At least one of the start or end indices "
-                        << "should be given.";
-
-  bool hasSubseq = getInput(0).hasSubseq();
-
-  outSeqStartPos_.resize(1, 0);
-  outSubSeqStartPos_.resize(1, 0);
-  selectedRows_.clear();
-
-  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
-  size_t rowIdx = 0;
-  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
-    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
-      for (size_t k = 0; k < beamSize; ++k) {
-        if (starts && starts->getElement(rowIdx, k) == -1.) break;
-        if (ends && ends->getElement(rowIdx, k) == -1.) break;
-
-        int begPos = inputSeqInfoVec_[i][j];
-        if (starts) begPos += starts->getElement(rowIdx, k);
-
-        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
-        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
-
-        int seqLen = endPos - begPos + 1;
-        CHECK_GT(seqLen, 0);
-        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
-        hasSubseq
-            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
-            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
-      }
-      rowIdx++;
-    }
-    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
-
-  if (hasSubseq) {
-    ICpuGpuVector::resizeOrCreate(
-        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
-    output_.subSequenceStartPositions->copyFrom(
-        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
-  }
-}
-
-void SequenceSliceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  checkInputs();
-
-  const Argument& inputSeq = getInput(0);
-  inputSeqInfoVec_.clear();
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  if (!useGpu_) {
-    if (inputLayers_.size() == 2U) {
-      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
-      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
-    } else if (inputLayers_.size() == 3U) {
-      startIdsOnCpu_ = getInputValue(1);
-      endIdsOnCpu_ = getInputValue(2);
-    }
-  } else {
-    copySliceIdsToCpu();
-  }
-
-  /*
-   * calculate the selected row indices in a batch, and build the output
-   * sequence information.
-   */
-  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
-
-  resetOutput(selectedRows_.size(), getSize());
-
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SequenceSliceLayer::backward(const UpdateCallback& callback) {
-  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.cpp b/paddle/legacy/gserver/layers/SequenceToBatch.cpp
deleted file mode 100644
index 5d0d588e6..000000000
--- a/paddle/legacy/gserver/layers/SequenceToBatch.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SequenceToBatch.h"
-#include <string.h>
-#include <algorithm>
-#include <iostream>
-#include <vector>
-
-namespace paddle {
-
-void SequenceToBatch::resizeOrCreateBatch(int batchSize,
-                                          size_t numSequences,
-                                          const int *seqStarts,
-                                          bool reversed,
-                                          bool prevBatchState) {
-  CHECK_EQ(seqStarts[numSequences], batchSize);
-  IVector::resizeOrCreate(seq2BatchIdx_, batchSize, useGpu_);
-  if (!useGpu_) {
-    cpuSeq2BatchIdx_ = seq2BatchIdx_;
-  } else {
-    IVector::resizeOrCreate(cpuSeq2BatchIdx_, batchSize, false);
-  }
-
-  /*
-   * calculate the length of each sequence & sort sequence index by the length
-   * Exampel:  Sequences = {s0, s1, s2}
-   *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-   *           seqStartAndLength[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
-   */
-  struct SeqStartAndLength {
-    int start_;
-    int length_;
-    int seqIdx_;
-    SeqStartAndLength(int start, int length, int seqIdx)
-        : start_(start), length_(length), seqIdx_(seqIdx) {}
-  };
-  std::vector<SeqStartAndLength> seqStartAndLength;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    int length = seqStarts[seqId + 1] - seqStarts[seqId];
-    seqStartAndLength.emplace_back(seqStarts[seqId], length, seqId);
-  }
-  std::sort(seqStartAndLength.begin(),
-            seqStartAndLength.end(),
-            [](SeqStartAndLength a, SeqStartAndLength b) {
-              return a.length_ > b.length_;
-            });
-
-  /*
-   * calculate the start position of each batch
-   * (numBatch equal the maxLength of sequences)
-   * Exampel:  Sequences = {s0, s1, s2}
-   *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-   *           numBatch = 5,
-   *           batchIndex = {b0, b1, b2, b3, b4}
-   *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
-   *           batchStartPositions[6] = {0, 3, 6, 9, 11, 12}
-   */
-  numBatch_ = (size_t)seqStartAndLength[0].length_;
-
-  IVector::resizeOrCreate(batchStartPositions_, numBatch_ + 1, false);
-  int *batchStartPositions = batchStartPositions_->getData();
-  batchStartPositions[0] = 0;
-  for (size_t n = 0; n < numBatch_; n++) {
-    int batchId = batchStartPositions[n];
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      size_t seqLength = seqStartAndLength[i].length_;
-      int start = seqStartAndLength[i].start_;
-      if (n < seqLength) {
-        if (!reversed) {
-          cpuSeq2BatchIdx_->getData()[batchId] = start + n;
-        } else {
-          cpuSeq2BatchIdx_->getData()[batchId] = start + seqLength - 1 - n;
-        }
-        batchId++;
-      } else {
-        break;
-      }
-    }
-    batchStartPositions[n + 1] = batchId;
-  }
-  if (useGpu_) {
-    seq2BatchIdx_->copyFrom(*cpuSeq2BatchIdx_);
-  }
-  if (prevBatchState) {
-    IVector::resizeOrCreate(seqIdx_, numSequences, useGpu_);
-    IVector::resizeOrCreate(seqEndIdxInBatch_, numSequences, useGpu_);
-    if (!useGpu_) {
-      cpuSeqIdx_ = seqIdx_;
-      cpuSeqEndIdxInBatch_ = seqEndIdxInBatch_;
-    } else {
-      IVector::resizeOrCreate(cpuSeqIdx_, numSequences, false);
-      IVector::resizeOrCreate(cpuSeqEndIdxInBatch_, numSequences, false);
-    }
-    int *seqIdx = cpuSeqIdx_->getData();
-    int *seqEndIdxInBatch = cpuSeqEndIdxInBatch_->getData();
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      seqIdx[i] = seqStartAndLength[i].seqIdx_;
-    }
-    for (size_t i = 0; i < seqStartAndLength.size(); ++i) {
-      if (seqStartAndLength[i].length_ > 0) {
-        seqEndIdxInBatch[seqStartAndLength[i].seqIdx_] =
-            batchStartPositions[seqStartAndLength[i].length_ - 1] + i;
-      } else {
-        seqEndIdxInBatch[seqStartAndLength[i].seqIdx_] = 0;
-      }
-    }
-    if (useGpu_) {
-      seqIdx_->copyFrom(*cpuSeqIdx_);
-      seqEndIdxInBatch_->copyFrom(*cpuSeqEndIdxInBatch_);
-    }
-  }
-}
-
-void SequenceToBatch::resizeOrCreate(Matrix &seqValue) {
-  Matrix::resizeOrCreate(batchValue_,
-                         seqValue.getHeight(),
-                         seqValue.getWidth(),
-                         /* trans= */ false,
-                         useGpu_);
-}
-
-MatrixPtr SequenceToBatch::getBatchValue(int batchId, int numRows) {
-  return getBatchValue(*batchValue_, batchId, numRows);
-}
-
-MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue,
-                                         int batchId,
-                                         int numRows) {
-  int *batchStartPositions = batchStartPositions_->getData();
-  int start = batchStartPositions[batchId];
-  int maxRows = batchStartPositions[batchId + 1] - batchStartPositions[batchId];
-  if (numRows == 0) {
-    numRows = maxRows;
-  } else {
-    CHECK_LE(numRows, maxRows);
-  }
-  return batchValue.subMatrix(start, numRows);
-}
-
-void SequenceToBatch::prevOutput2Batch(Matrix &src, Matrix &dst) {
-  sequence2BatchCopy(dst, src, *seqIdx_, true);
-}
-
-void SequenceToBatch::getSeqOutputFromBatch(Matrix &sequence, Matrix &batch) {
-  sequence2BatchCopy(sequence, batch, *seqEndIdxInBatch_, true);
-}
-
-void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
-                                         Matrix &sequence,
-                                         IVector &seq2BatchIdx,
-                                         bool seq2batch) {
-  int seqWidth = sequence.getWidth();
-  int batchCount = batch.getHeight();
-  real *batchData = batch.getData();
-  real *seqData = sequence.getData();
-  int *idxData = seq2BatchIdx.getData();
-
-  if (useGpu_) {
-    hl_sequence2batch_copy(
-        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
-  } else {
-    if (seq2batch) {
-#ifdef PADDLE_USE_MKLML
-      const int blockMemSize = 8 * 1024;
-      const int blockSize = blockMemSize / sizeof(real);
-#pragma omp parallel for collapse(2)
-      for (int i = 0; i < batchCount; ++i) {
-        for (int j = 0; j < seqWidth; j += blockSize) {
-          memcpy(batch.rowBuf(i) + j,
-                 sequence.rowBuf(idxData[i]) + j,
-                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
-                                            : blockMemSize);
-        }
-      }
-#else
-      for (int i = 0; i < batchCount; ++i) {
-        memcpy(batch.rowBuf(i),
-               sequence.rowBuf(idxData[i]),
-               seqWidth * sizeof(real));
-      }
-#endif
-    } else {
-#ifdef PADDLE_USE_MKLML
-#pragma omp parallel for
-#endif
-      for (int i = 0; i < batchCount; ++i) {
-        memcpy(sequence.rowBuf(idxData[i]),
-               batch.rowBuf(i),
-               seqWidth * sizeof(real));
-      }
-    }
-  }
-}
-
-void SequenceToBatch::sequence2BatchAdd(Matrix &batch,
-                                        Matrix &sequence,
-                                        IVector &seq2BatchIdx,
-                                        bool seq2batch) {
-  int seqWidth = sequence.getWidth();
-  int batchCount = batch.getHeight();
-  real *batchData = batch.getData();
-  real *seqData = sequence.getData();
-  int *idxData = seq2BatchIdx.getData();
-
-  if (useGpu_) {
-    hl_sequence2batch_add(
-        batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
-  } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
-        batch.subMatrix(i, 1)->add(*sequence.subMatrix(idxData[i], 1));
-      } else {
-        sequence.subMatrix(idxData[i], 1)->add(*batch.subMatrix(i, 1));
-      }
-    }
-  }
-}
-
-void SequenceToBatch::copyFromSeq(Matrix &seqValue) {
-  Matrix::resizeOrCreate(batchValue_,
-                         seqValue.getHeight(),
-                         seqValue.getWidth(),
-                         /* trans= */ false,
-                         useGpu_);
-  sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, true);
-}
-
-void SequenceToBatch::copyBackSeq(Matrix &seqValue) {
-  sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, false);
-}
-
-void SequenceToBatch::copy(Matrix &seqValue,
-                           Matrix &batchValue,
-                           bool seq2batch) {
-  sequence2BatchCopy(batchValue, seqValue, *seq2BatchIdx_, seq2batch);
-}
-
-void SequenceToBatch::add(Matrix &seqValue,
-                          Matrix &batchValue,
-                          bool seq2batch) {
-  sequence2BatchAdd(batchValue, seqValue, *seq2BatchIdx_, seq2batch);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.h b/paddle/legacy/gserver/layers/SequenceToBatch.h
deleted file mode 100644
index 7ed517937..000000000
--- a/paddle/legacy/gserver/layers/SequenceToBatch.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-
-/*
- * This class can used to modify the matrix structure of sequence matrix into
- * batch structure.
- * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
- * batch matrix:    [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
- * Cn_s is the state for sequence s at time n.
- *
- * Exampel:  sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
- *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
- *           batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
- *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
- *
- * Use:
- * Input: seqMatrix, seqStarts(Sequence Start Positions)
- * Output: batchMatrix
- * 1. SequenceToBatch seq2batch;
- * 2. seq2batch.resizeOrCreateBatch(seqStarts);     // calculate seq2BatchIdx
- * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
- *
- */
-class SequenceToBatch {
- public:
-  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
-
-  /* resize and calculate the batchIndex_ */
-  void resizeOrCreateBatch(int batchSize,
-                           size_t numSequences,
-                           const int *seqStarts,
-                           bool reversed,
-                           bool prevBatchState = false);
-
-  /* sequence matrix and batch matrix copy:
-   * seq2batch: copy(seqValue, batchValue, true);
-   * batch2seq: copy(seqValue, batchValue, false);
-   */
-  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  /* sequence/batch matrix add to batch/sequence matrix */
-  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
-
-  size_t getNumBatch() const { return numBatch_; }
-
-  /* resize or create a batch matrix(batchValue_) */
-  void resizeOrCreate(Matrix &seqValue);
-  /* copy seqValue to batchValue_ */
-  void copyFromSeq(Matrix &seqValue);
-  /* copy batchValue_ to seqValue */
-  void copyBackSeq(Matrix &seqValue);
-  MatrixPtr getBatchValue(int batchId, int numRows = 0);
-  MatrixPtr getBatchValue() { return batchValue_; }
-  /*tranfer preBatchOutput to batch struct*/
-  void prevOutput2Batch(Matrix &src, Matrix &dst);
-  /*get sequence output from batch struct*/
-  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
-
-  /* Copy the index from another seq2batch. */
-  void shareIndexWith(const SequenceToBatch &seq2batch) {
-    CHECK(useGpu_ == seq2batch.useGpu_);
-    batchStartPositions_ = seq2batch.batchStartPositions_;
-    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
-    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
-    numBatch_ = seq2batch.numBatch_;
-  }
-
- protected:
-  void sequence2BatchCopy(Matrix &batch,
-                          Matrix &sequence,
-                          IVector &seq2BatchIdx,
-                          bool seq2batch);
-  void sequence2BatchAdd(Matrix &batch,
-                         Matrix &sequence,
-                         IVector &seq2BatchIdx,
-                         bool seq2batch);
-
-  IVectorPtr batchStartPositions_;
-  IVectorPtr seq2BatchIdx_;
-  IVectorPtr cpuSeq2BatchIdx_;
-  IVectorPtr cpuSeqIdx_;
-  IVectorPtr cpuSeqEndIdxInBatch_;
-  IVectorPtr seqIdx_;
-  IVectorPtr seqEndIdxInBatch_;
-  size_t numBatch_;
-  bool useGpu_;
-  MatrixPtr batchValue_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SliceProjection.cpp b/paddle/legacy/gserver/layers/SliceProjection.cpp
deleted file mode 100644
index b474f2db7..000000000
--- a/paddle/legacy/gserver/layers/SliceProjection.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * SliceProjection can slice the input value into multiple parts,
- * and then select some of them to merge into a new output.
- *
- * First, calculate the slices that need to be merged into the output.
- * slices = input.slices().for_output()
- *
- * Second, merge each slice into the output.
- * for(auto slice: slices) {
- *   out.addAtOffset(slice, offset);
- * }
- *
- * Input slices as output: s0, s1, ...:
- *   -----------------------
- *   |///|   |//////|      |
- *   |/s0|   |//s1//|      |
- *   |///|   |//////|      |
- *   -----------------------
- * Output, merge s0, s1, ... into one output:
- *   ----------------
- *   |///|//////|   |
- *   |/s0|//s1//|...|
- *   |///|//////|   |
- *   ----------------
- *
- * The config file api is slice_projection.
- */
-class SliceProjection : public Projection {
- public:
-  SliceProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::vector<std::pair<size_t, size_t>> slices_;
-};
-
-REGISTER_PROJECTION(slice, SliceProjection);
-
-/**
- * Constructed function.
- * @note SliceProjection should not have any parameter.
- */
-SliceProjection::SliceProjection(const ProjectionConfig& config,
-                                 const ParameterPtr& parameter,
-                                 bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'slice' projection should not have any parameter";
-
-  slices_.reserve(config.slices_size());
-  for (const auto& slice : config.slices()) {
-    slices_.push_back(std::make_pair(slice.start(), slice.end()));
-  }
-}
-
-void SliceProjection::forward() {
-  size_t offset = 0;
-  for (auto& slice : slices_) {
-    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
-    out_->value->addAtOffset(*slice_out, offset);
-    offset += slice_out->getWidth();
-  }
-}
-
-void SliceProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    size_t offset = 0;
-    for (auto& slice : slices_) {
-      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
-      slice_out->addAtOffset(*out_->grad, offset);
-      offset += slice_out->getWidth();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
deleted file mode 100644
index 9168fd7dd..000000000
--- a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for applying a slope and an intercept to the input
- * element-wise.
- * This layer is used in NEURAL TURING MACHINE.
- * @note There is no activation and weight in this layer.
- *
- * \f[
- *    y = ax + b
- * \f]
- *
- * Here, a is scale and b is offset, which are provided as attributes of the
- * layer.
- *
- * The config file api is slope_intercept_layer.
- */
-
-class SlopeInterceptLayer : public Layer {
- public:
-  explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
-
-bool SlopeInterceptLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SlopeInterceptLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t size = getSize();
-
-  CHECK_EQ(size, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
-    outV->mulScalar(*inV, config_.slope());
-    outV->add(config_.intercept());
-  }
-}
-
-void SlopeInterceptLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str());
-    inG->add(*outG, config_.slope());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
deleted file mode 100644
index b445a399e..000000000
--- a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SpatialPyramidPoolLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(spp, SpatialPyramidPoolLayer);
-
-ProjectionConfig SpatialPyramidPoolLayer::getConfig(size_t imgSizeW,
-                                                    size_t imgSizeH,
-                                                    size_t channels,
-                                                    size_t pyramidLevel,
-                                                    std::string& poolType) {
-  ProjectionConfig config;
-  config.set_type("pool");
-  PoolConfig* conf = config.mutable_pool_conf();
-  conf->set_channels(channels);
-  conf->set_img_size(imgSizeW);
-  conf->set_img_size_y(imgSizeH);
-  conf->set_pool_type(poolType);
-
-  int numBins = std::pow(2, pyramidLevel);
-
-  int sizeH = std::ceil(imgSizeH / static_cast<double>(numBins));
-  int paddingH = (sizeH * numBins - imgSizeH + 1) / 2;
-  int outSizeH = outputSize(imgSizeH, sizeH, paddingH, sizeH, true);
-
-  int sizeW = std::ceil(imgSizeW / static_cast<double>(numBins));
-  int paddingW = (sizeW * numBins - imgSizeW + 1) / 2;
-  int outSizeW = outputSize(imgSizeW, sizeW, paddingW, sizeW, true);
-
-  conf->set_stride(sizeW);
-  conf->set_stride_y(sizeH);
-  conf->set_size_x(sizeW);
-  conf->set_size_y(sizeH);
-  conf->set_padding(paddingW);
-  conf->set_padding_y(paddingH);
-  conf->set_output_x(outSizeW);
-  conf->set_output_y(outSizeH);
-  config.set_output_size(outSizeH * outSizeW * channels);
-  return config;
-}
-
-size_t SpatialPyramidPoolLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  const ImageConfig& conf = config_.inputs(0).spp_conf().image_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = conf.img_size();
-  }
-
-  size_t outputH = 1;
-  size_t outputW = (std::pow(4, pyramidHeight_) - 1) / (4 - 1);
-
-  layerSize = outputH * outputW * channels_;
-  return layerSize;
-}
-
-bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const SppConfig& sppConf = config_.inputs(0).spp_conf();
-  pyramidHeight_ = sppConf.pyramid_height();
-  poolType_ = sppConf.pool_type();
-
-  const ImageConfig& imageConf = sppConf.image_conf();
-  channels_ = imageConf.channels();
-  imgSizeW_ = imageConf.img_size();
-  imgSizeH_ = imageConf.has_img_size_y() ? imageConf.img_size_y() : imgSizeW_;
-  poolProjections_.reserve(pyramidHeight_);
-  projCol_.reserve(pyramidHeight_);
-  projOutput_.resize(pyramidHeight_);
-
-  size_t startCol = 0;
-  size_t endCol = 0;
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    poolProjections_.emplace_back(PoolProjection::create(
-        getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_),
-        nullptr,
-        useGpu_));
-    endCol += poolProjections_[i]->getOutputSize();
-    projCol_.push_back(std::make_pair(startCol, endCol));
-    startCol = endCol;
-  }
-  CHECK_EQ(endCol, getSize());
-  return true;
-}
-
-void SpatialPyramidPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  resetOutput(batchSize, getSize());
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    size_t startCol = projCol_[i].first;
-    size_t endCol = projCol_[i].second;
-    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
-    if (output_.grad) {
-      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
-    }
-  }
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    poolProjections_[i]->forward(&getInput(0), &projOutput_[i], passType);
-  }
-}
-
-void SpatialPyramidPoolLayer::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < pyramidHeight_; i++) {
-    if (poolProjections_[i]) {
-      poolProjections_[i]->backward(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
deleted file mode 100644
index 6d8ed9c87..000000000
--- a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "PoolProjection.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-/**
- * @brief A layer for spatial pyramid pooling on the input image by taking
- * the max, average, etc. within regions, so that the result vector of
- * different sized images are of the same size.
- *
- * The config file api is spp_layer.
- */
-
-class SpatialPyramidPoolLayer : public Layer {
- protected:
-  size_t channels_;
-  size_t imgSizeW_;
-  size_t imgSizeH_;
-  size_t pyramidHeight_;
-  std::string poolType_;
-
-  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-
- public:
-  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  ProjectionConfig getConfig(size_t sizeX_,
-                             size_t sizeY_,
-                             size_t channels,
-                             size_t pyamidLevel_,
-                             std::string& poolType_);
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
deleted file mode 100644
index f363c2ac8..000000000
--- a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-class SubNestedSequenceLayer : public Layer {
- public:
-  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
- private:
-  /*
-   * This functions generates the indices of rows in a batch according to the
-   * indices of selected sub-sequence in each sequence.
-   *
-   * Examples:
-   * selectedIndices:
-   *   [
-   *     [0, 1, -1],
-   *     [0, 1, 2],
-   *     [0, -1, -1],
-   *     [0, 2, 3],
-   *   ]
-   * inputSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   *
-   * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
-   */
-
-  void calSelectedRows(const MatrixPtr selectedIndices,
-                       const std::vector<std::vector<int>>& inputSeqInfo);
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second is some selected indices of the give sequence to trim
-   * the nested sequence, are actually filled with int types so that storing
-   * int types information in real number matrices is very dangerous, since
-   * real numbers will be convered to int types. If a user fills this matrix
-   * himself, invalid data may occor.
-   *
-   * if the second input of this layer is on GPU memory, copy it to CPU memory.
-   */
-  MatrixPtr selIdsCpu_;
-
-  /*
-   * reorganize sequenceStartPositions and subSequenceStartPositions
-   * into a 2d vector to facilitate the sequence selection process.
-   */
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-
-  /* store the final selected row indices in a batch */
-  IVectorPtr rowIndice_;
-  /* rowIndice_ and selectedRows_ actually share a same memory. */
-  std::vector<int> selectedRows_;
-};
-
-REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
-
-bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(2U, inputLayers_.size());
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubNestedSequenceLayer::calSelectedRows(
-    const MatrixPtr selectedIndices,
-    const std::vector<std::vector<int>>& inputSeqInfo) {
-  selectedRows_.clear();
-
-  std::vector<int> outSeqStartInfo(1, 0);
-  std::vector<int> outSubSeqStartInfo(1, 0);
-
-  size_t seqNum = selectedIndices->getHeight();
-  size_t beamSize = selectedIndices->getWidth();
-  for (size_t i = 0; i < seqNum; ++i) {
-    for (size_t j = 0; j < beamSize; ++j) {
-      if (selectedIndices->getElement(i, j) == -1.) break;
-      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
-      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
-
-      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
-                         inputSeqInfoVec_[i][selSubSeqIdx];
-      for (size_t k = 0; k < subSeqLen; ++k)
-        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
-      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
-    }
-    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
-
-  ICpuGpuVector::resizeOrCreate(
-      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
-  output_.subSequenceStartPositions->copyFrom(
-      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
-}
-
-void SubNestedSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
-                              << "must be a nested sequence.";
-  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
-
-  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
-    /*
-     * Currently, the second input for this layer is generated by
-     * kmax_sequence_score_layer whose output is always stored on CPU,
-     * or a data_layer which canbe on GPU.
-     *
-     * If the second input is on GPU, copy it to CPU memory, because this
-     * input always uses very few memory, and operations related to it are
-     * all logic control, not computations.
-     */
-    Matrix::resizeOrCreate(selIdsCpu_,
-                           selectedIndices->getHeight(),
-                           selectedIndices->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    selIdsCpu_->copyFrom(*selectedIndices);
-  } else {
-    selIdsCpu_ = selectedIndices;
-  }
-
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
-
-  resetOutput(selectedRows_.size(), getSize());
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inputSeqGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
deleted file mode 100644
index 36796f047..000000000
--- a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for taking the subsequence according to given offset and size
- * Input: original sequence, offset, size
- * Output: subsequence
- */
-
-class SubSequenceLayer : public Layer {
- protected:
-  std::unique_ptr<Weight> biases_;
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-
- public:
-  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(subseq, SubSequenceLayer);
-
-bool SubSequenceLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(3U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input = getInput(0);
-  size_t numSequences1 = input.getNumSequences();
-  auto startPositions1 = input.sequenceStartPositions->getVector(false);
-
-  const Argument& offsetSeq = getInput(1);
-  size_t numSequences2 = offsetSeq.getNumSequences();
-  auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false);
-
-  const Argument& sizeSeq = getInput(2);
-  size_t numSequences3 = sizeSeq.getNumSequences();
-  auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input.value->getWidth());
-
-  CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize());
-  CHECK_EQ(numSequences3, startPositions3->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-  CHECK_EQ(numSequences2, numSequences3);
-
-  MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  CHECK_EQ(offsetValue->getSize(), numSequences1);
-  CHECK_EQ(sizeValue->getSize(), numSequences1);
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-
-  // get total height of output
-  size_t height = 0;
-  for (size_t seqId = 0; seqId < numSequences1; seqId++) {
-    height += sizes[seqId];
-  }
-
-  // reset output
-  resetOutput(height, dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str());
-
-    size_t offsetIn = 0;
-    size_t offsetOut = 0;
-    size_t size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      outputValue->subMatrix(offsetOut, size, tmpDest_)
-          ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_)));
-
-      offsetOut += size;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-    int offset = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      tgtBuf[seqId] = offset;
-      offset += sizes[seqId];
-    }
-    tgtBuf[numSequences1] = offset;
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SubSequenceLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  const int* starts1 = startPositions1->getData();
-
-  const Argument& offsetSeq = getInput(1);
-  const Argument& sizeSeq = getInput(2);
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str());
-
-    int offsetIn = 0;
-    int offsetOut = 0;
-    int size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      inputGrad1->subMatrix(offsetIn, size, tmpDest_)
-          ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_)));
-      offsetOut += size;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
deleted file mode 100644
index 410f4dd7c..000000000
--- a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for sum-to-one normalization,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]}
- * \f]
- * where \f$in\f$ is a (batchSize x dataDim) input vector,
- * and \f$out\f$ is a (batchSize x dataDim) output vector.
- *
- * The config file api is sum_to_one_norm_layer.
- */
-
-class SumToOneNormLayer : public Layer {
- protected:
-  /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
-  MatrixPtr reciprocalRowSum_;
-  /// dotSum = output_.grad \f$.*\f$ output_.value
-  MatrixPtr dotSum_;
-
- public:
-  explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
-
-bool SumToOneNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SumToOneNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(dataDim, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_);
-    inV->rowSum(*reciprocalRowSum_);
-
-    // todo: matrix checks
-    CHECK_GT(reciprocalRowSum_->getMin(), 0.0);
-
-    reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0);
-
-    // outV = inV * reciprocalRowSum
-    outV->rowScale(0, *inV, *reciprocalRowSum_);
-  }
-}
-
-void SumToOneNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV->getHeight();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-
-    // dotSum = outG .* outV
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-
-    // inG += -1 * (dotSum / rowSum)
-    dotSum_->dotMul(*dotSum_, *reciprocalRowSum_);
-    inG->rowAdd(0, *inG, *dotSum_, -1.0);
-    // inG += outG * (1/rowSum)
-    inG->addRowScale(0, *outG, *reciprocalRowSum_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
deleted file mode 100644
index 513f3df7b..000000000
--- a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOrderLayer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(switch_order, SwitchOrderLayer);
-
-bool SwitchOrderLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  auto& img_conf = config_.inputs(0).image_conf();
-  size_t inD = img_conf.img_size_z();
-  size_t inH =
-      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
-  size_t inW = img_conf.img_size();
-  size_t inC = img_conf.channels();
-  inH = inH * inD;
-  inDims_ = TensorShape({0, inC, inH, inW});
-  outDims_ = TensorShape(4);
-
-  auto& reshape_conf = config_.reshape_conf();
-  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
-    heightAxis_.push_back(reshape_conf.height_axis(i));
-  }
-  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
-    widthAxis_.push_back(reshape_conf.width_axis(i));
-  }
-  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
-  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
-  return true;
-}
-
-void SwitchOrderLayer::setOutDims() {
-  outDims_.setDim(0, inDims_[0]);
-  outDims_.setDim(1, inDims_[2]);
-  outDims_.setDim(2, inDims_[3]);
-  outDims_.setDim(3, inDims_[1]);
-  reshapeHeight_ = 1;
-  for (size_t i = 0; i < heightAxis_.size(); i++) {
-    reshapeHeight_ *= outDims_[heightAxis_[i]];
-  }
-  output_.setFrameHeight(reshapeHeight_);
-  reshapeWidth_ = 1;
-  for (size_t i = 0; i < widthAxis_.size(); i++) {
-    reshapeWidth_ *= outDims_[widthAxis_[i]];
-  }
-  output_.setFrameWidth(reshapeWidth_);
-}
-
-void SwitchOrderLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int d = inputLayers_[0]->getOutput().getFrameDepth();
-  d = (d == 0 ? 1 : d);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h * d);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  int totalCount = input->getElementCnt();
-  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
-  if (channels != 0) inDims_.setDim(1, channels);
-}
-
-void SwitchOrderLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
-  if (heightAxis_.size() > 0) {
-    resetOutput(reshapeHeight_, reshapeWidth_);
-  }
-
-  // switch NCHW to NHWC
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_);
-  nchw2nhwc_[0]->calc(inputs, outputs);
-  forwardActivation();
-}
-
-void SwitchOrderLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  backwardActivation();
-
-  // switch NHWC to NCHW
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  nhwc2nchw_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.h b/paddle/legacy/gserver/layers/SwitchOrderLayer.h
deleted file mode 100644
index 8a551a2bb..000000000
--- a/paddle/legacy/gserver/layers/SwitchOrderLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer calculate softmax in image channel dimension.
- */
-class SwitchOrderLayer : public Layer {
- public:
-  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SwitchOrderLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void setInDims();
-  void setOutDims();
-
- protected:
-  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
-  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-  std::vector<int> heightAxis_;
-  std::vector<int> widthAxis_;
-  size_t reshapeHeight_;
-  size_t reshapeWidth_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TableProjection.cpp b/paddle/legacy/gserver/layers/TableProjection.cpp
deleted file mode 100644
index 326e241d0..000000000
--- a/paddle/legacy/gserver/layers/TableProjection.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TableProjection.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(table, TableProjection);
-
-TableProjection::TableProjection(const ProjectionConfig& config,
-                                 const ParameterPtr& parameter,
-                                 bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  table_.reset(
-      new Weight(config.input_size(), config.output_size(), parameter));
-}
-
-void TableProjection::prefetch(const Argument* in) {
-  CHECK(in->ids);
-  auto* sparseParam =
-      dynamic_cast<SparsePrefetchRowCpuMatrix*>(table_->getW().get());
-  if (sparseParam) {
-    sparseParam->addRows(in->ids);
-  }
-}
-
-void TableProjection::forward() {
-  CHECK(in_->ids);
-  out_->value->selectRows(*table_->getW(), *in_->ids);
-}
-
-void TableProjection::backward(const UpdateCallback& callback) {
-  if (table_->getWGrad()) {
-    CHECK(in_->ids);
-    out_->grad->addToRows(*table_->getWGrad(), *in_->ids);
-    parameter_->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TableProjection.h b/paddle/legacy/gserver/layers/TableProjection.h
deleted file mode 100644
index 60286149f..000000000
--- a/paddle/legacy/gserver/layers/TableProjection.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * Table projection takes index data input. It select rows from parameter
- * where row_id is in input_ids:
- * \f[
- *   out.row[i] += table.row[ids[i]]
- * \f]
- * where \f$out\f$ is out, \f$table\f$ is parameter, \f$ids\f$ is input_ids,
- * and \f$i\f$ is row_id.
- *
- * The config file api is table_projection.
- *
- * @note If \f$ids[i] = -1\f$, it will be ignored.
- */
-class TableProjection : public Projection {
- public:
-  TableProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  /**
-   * If use sparse row matrix as parameter, prefetch feature ids in input label.
-   */
-  virtual void prefetch(const Argument* in);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> table_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.cpp b/paddle/legacy/gserver/layers/TensorLayer.cpp
deleted file mode 100644
index 7f874bce0..000000000
--- a/paddle/legacy/gserver/layers/TensorLayer.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorLayer.h"
-
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(tensor, TensorLayer);
-
-bool TensorLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK_EQ(inputLayers_.size(), 2LU);
-  CHECK(parameters_[0]);
-  CHECK(!parameters_[1]);
-
-  // Option the parameters
-  size_t height = inputLayers_[0]->getSize();
-  size_t width = inputLayers_[1]->getSize();
-  CHECK_EQ(width * height * getSize(), parameters_[0]->getSize());
-
-  for (size_t i = 0; i < getSize(); ++i) {
-    // create a new weight
-    Weight* w = new Weight(height, width, parameters_[0], i * width * height);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void TensorLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  { resetOutput(batchSize, size); }
-
-  MatrixPtr outV = getOutputValue();
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* e1 * W * trans(e2) */ {
-    MatrixPtr input1 = getInputValue(0);
-    MatrixPtr input2 = getInputValue(1);
-    MatrixPtr tmpMat = Matrix::create(input2->getHeight(),
-                                      input2->getWidth(),
-                                      /* trans= */ false,
-                                      input2->useGpu());
-    REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-      tmpMat->mul(*input1, *weights, 1, 0);
-      outV->rowDotMul(i, *tmpMat, *input2);
-    }
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void TensorLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  MatrixPtr input1 = getInputValue(0);
-  MatrixPtr input2 = getInputValue(1);
-  MatrixPtr oGrad = getOutputGrad();
-  MatrixPtr tmpMat = Matrix::create(input1->getHeight(),
-                                    input1->getWidth(),
-                                    /* trans= */ false,
-                                    input1->useGpu());
-
-  /* trans(grad * e1) * e2 */ {
-    REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      if (weights_[i]->getWGrad()) {
-        tmpMat->rowScale(i, *input1, *oGrad);
-        MatrixPtr input1_T = tmpMat->getTranspose();
-        weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1);
-      }
-    }
-  }
-
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */ {
-    MatrixPtr preGrad1 = getInputGrad(0);
-    MatrixPtr preGrad2 = getInputGrad(1);
-
-    REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-
-      if (NULL != preGrad1) { /* (grad * e2) * trans(W) */
-        tmpMat->rowScale(i, *input2, *oGrad);
-        MatrixPtr weights_T = weights->getTranspose();
-        preGrad1->mul(*tmpMat, *weights_T, 1, 1);
-      }
-      if (NULL != preGrad2) { /* (grad * e1) * W */
-        tmpMat->rowScale(i, *input1, *oGrad);
-        preGrad2->mul(*tmpMat, *weights, 1, 1);
-      }
-    }
-  }
-  hl_set_sync_flag(syncFlag);
-  parameters_[0]->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.h b/paddle/legacy/gserver/layers/TensorLayer.h
deleted file mode 100644
index fc491a7c9..000000000
--- a/paddle/legacy/gserver/layers/TensorLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief TensorLayer takes two input vectors.
- * \f[
- *     y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1
- * \f]
- *
- * - \f$x_{1}\f$: the first input, size is M.
- * - \f$x_{2}\f$: the second input, size is N.
- * - y: output, size is K.
- * - \f$y_{i}\f$: i-th element of y.
- * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
- * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
- *
- * The config file api is tensor_layer.
- */
-
-class TensorLayer : public Layer {
- protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
- public:
-  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.cpp b/paddle/legacy/gserver/layers/TransLayer.cpp
deleted file mode 100644
index fd1d435ea..000000000
--- a/paddle/legacy/gserver/layers/TransLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TransLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER(trans, TransLayer);
-
-bool TransLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for trans-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  return true;
-}
-
-void TransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  MatrixPtr input = getInputValue(0);
-  int height = input->getHeight();
-  int width = input->getWidth();
-
-  resizeOutput(width, height);
-
-  MatrixPtr outV = getOutputValue();
-
-  /* outV's memory has been allocated, so memAlloc = false */
-  input->transpose(outV, false);
-  if (getInputGrad(0)) {
-    zeroGrad();
-  }
-}
-
-void TransLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = getOutputGrad();
-  if (outputGrad == NULL) {
-    return;
-  }
-  MatrixPtr preGrad = getInputGrad(0);
-  if (preGrad) {
-    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
-                                         preGrad->getWidth(),
-                                         /* trans= */ false,
-                                         preGrad->useGpu());
-    outputGrad->transpose(transGrad, false);
-    preGrad->add(*transGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.h b/paddle/legacy/gserver/layers/TransLayer.h
deleted file mode 100644
index 0a6b13933..000000000
--- a/paddle/legacy/gserver/layers/TransLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for transposing a minibatch matrix.
- * \f[
-     y = x^\mathrm{T}
- * \f]
- * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
- *
- * The config file api is trans_layer.
- */
-class TransLayer : public Layer {
- public:
-  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
deleted file mode 100644
index c8533dc7d..000000000
--- a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief TransposedFullMatrixProjection performs full matrix multiplication:
- * out.row[i] += in.row[i] * weight.transpose
- *
- * The config file api is trans_full_matrix_projection.
- */
-class TransposedFullMatrixProjection : public Projection {
- public:
-  TransposedFullMatrixProjection(const ProjectionConfig& config,
-                                 ParameterPtr parameter,
-                                 bool useGPu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
- protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection);
-
-TransposedFullMatrixProjection::TransposedFullMatrixProjection(
-    const ProjectionConfig& config, ParameterPtr parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(
-      new Weight(config.output_size(), config.input_size(), parameter));
-}
-
-void TransposedFullMatrixProjection::forward() {
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1);
-}
-
-void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(
-        *(out_->grad->getTranspose()), *(in_->value), 1, 1);
-  }
-
-  // If callback does not change value, backprop error asynchronously so that
-  // we can do the callback concurrently.
-  // This is still a little bit dangerous since theoretically for
-  // SyncMultiGpuMachine it is possible that the value copyback can still
-  // happen at the same time as the error backprop where the value is being
-  // used.
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1);
-  }
-
-  hl_set_sync_flag(syncFlag);
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.cpp b/paddle/legacy/gserver/layers/UpsampleLayer.cpp
deleted file mode 100644
index 3ff5332e6..000000000
--- a/paddle/legacy/gserver/layers/UpsampleLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-    limitations under the License. */
-
-#include "UpsampleLayer.h"
-#include "iostream"
-
-namespace paddle {
-
-REGISTER_LAYER(upsample, UpsampleLayer);
-
-size_t UpsampleLayer::getOutputSize() {
-  if (upsampleSize_ == 0) {
-    upsampleSize_ = imgSize_ * scale_ - static_cast<int>(padOutX_);
-    upsampleSizeY_ = imgSizeY_ * scaleY_ - static_cast<int>(padOutY_);
-  }
-  return upsampleSize_ * upsampleSizeY_ * channels_;
-}
-
-bool UpsampleLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-  CHECK_EQ(config_.inputs_size(), 2);
-  const auto& conf = config_.inputs(0).upsample_conf();
-  const auto& img_conf = conf.image_conf();
-
-  imgSizeY_ =
-      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
-  imgSize_ = img_conf.img_size();
-  channels_ = img_conf.channels();
-
-  CHECK((conf.has_upsample_size()) || (conf.has_scale()))
-      << "scale or upsample_size is required.";
-
-  if (conf.has_upsample_size()) {
-    upsampleSize_ = conf.upsample_size();
-    upsampleSizeY_ = upsampleSize_;
-    if (conf.has_upsample_size_y()) {
-      upsampleSizeY_ = conf.upsample_size_y();
-    }
-  } else {
-    if (!conf.has_scale_y()) {
-      scale_ = scaleY_ = conf.scale_y();
-      CHECK_GT(static_cast<int>(scale_), 1);
-    } else {
-      scale_ = conf.scale();
-      scaleY_ = conf.scale_y();
-    }
-    padOutX_ = conf.pad_out_x();
-    padOutY_ = conf.pad_out_y();
-    CHECK(!padOutX_ || scale_ == 2)
-        << "Output height padding compensation requires scale_ == 2";
-    CHECK(!padOutY_ || scaleY_ == 2)
-        << "Output width padding compensation requires scaleY_ == 2";
-    upsampleSize_ = upsampleSizeY_ = 0;
-  }
-  return true;
-}
-
-void UpsampleLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr input = getInputValue(0);
-  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
-
-  size_t batchSize = input->getHeight();
-  size_t outSize = getOutputSize();
-
-  CHECK_EQ(input->getWidth(), mask->getWidth());
-  CHECK_EQ(mask->getHeight(), batchSize);
-  resetOutput(batchSize, outSize);
-
-  MatrixPtr output = getOutputValue();
-  output->upsampleForward(*input,
-                          *mask,
-                          imgSize_,
-                          imgSizeY_,
-                          channels_,
-                          upsampleSize_,
-                          upsampleSizeY_);
-}
-
-void UpsampleLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  inputGrad->upsampleBackward(*outputGrad,
-                              *mask,
-                              imgSize_,
-                              imgSizeY_,
-                              channels_,
-                              upsampleSize_,
-                              upsampleSizeY_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.h b/paddle/legacy/gserver/layers/UpsampleLayer.h
deleted file mode 100644
index 2fe593824..000000000
--- a/paddle/legacy/gserver/layers/UpsampleLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer transpose the pooling process.
- * It takes two input, the first input is the input data, and
- * the second is the mask data from the max-pool-with-mask layer.
- *
- */
-
-class UpsampleLayer : public Layer {
- public:
-  explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
-  ~UpsampleLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  size_t getOutputSize();
-
- protected:
-  size_t scale_, scaleY_;
-  size_t upsampleSize_, upsampleSizeY_;
-  size_t padOutX_, padOutY_;
-  size_t imgSize_, imgSizeY_;
-  size_t channels_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.cpp b/paddle/legacy/gserver/layers/ValidationLayer.cpp
deleted file mode 100644
index 9956fd2ed..000000000
--- a/paddle/legacy/gserver/layers/ValidationLayer.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <fstream>
-#include <memory>
-
-#include "ValidationLayer.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-bool ValidationLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  return Layer::init(layerMap, parameterMap);
-}
-
-void ValidationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  CHECK(output);
-  IVectorPtr label = getInputLabel(*getLabelLayer());
-  CHECK(label);
-  validationImp(output, label);
-}
-
-void ValidationLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-}
-
-bool AucValidation::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("last-column-auc");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  if (3 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[2]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return ret;
-}
-
-void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-
-  bool supportWeight = (3 == inputLayers_.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? getInputValue(*inputLayers_[2]) : nullptr;
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    size_t height = output->getHeight();
-    size_t width = output->getWidth();
-    Matrix::resizeOrCreate(cpuOutput_,
-                           height,
-                           width,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    float y1 = output->getData()[i * output->getWidth() + 1];
-    int* labels = label->getData();
-    predictArray_.push_back(PredictionResult(y1, labels[i]));
-  }
-  std::vector<Argument> arguments;
-  if (3 == inputLayers_.size()) {
-    arguments.resize(3);
-    arguments[2].value = weight;
-  } else {
-    arguments.resize(2);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  evaluator_->evalImp(arguments);
-}
-
-void AucValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << std::endl;
-    }
-  }
-
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-  predictArray_.clear();
-}
-
-bool PnpairValidation::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("pnpair");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  config.add_input_layers(inputLayers_[2]->getName());
-  if (4 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[3]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return true;
-}
-
-void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-  MatrixPtr weight =
-      (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr;
-  IVectorPtr info = getInputLabel(*getInfoLayer());
-  std::vector<Argument> arguments;
-  if (4 == inputLayers_.size()) {
-    arguments.resize(4);
-    arguments[3].value = weight;
-  } else {
-    arguments.resize(3);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  arguments[2].ids = info;
-  evaluator_->evalImp(arguments);
-}
-
-void PnpairValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    (dynamic_cast<PnpairEvaluator*>(evaluator_.get()))->printPredictResults();
-  }
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.h b/paddle/legacy/gserver/layers/ValidationLayer.h
deleted file mode 100644
index fbc94e8ef..000000000
--- a/paddle/legacy/gserver/layers/ValidationLayer.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "Layer.h"
-#include "paddle/legacy/gserver/evaluators/Evaluator.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-class ValidationLayer : public Layer {
- public:
-  explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  LayerPtr getInfoLayer() {
-    assert(inputLayers_.size() > 2);
-    return inputLayers_[2];
-  }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
-
-  void onPassEnd() override = 0;
-};
-
-/*
- * AucValidation
- */
-class AucValidation : public ValidationLayer {
- public:
-  explicit AucValidation(const LayerConfig& config)
-      : ValidationLayer(config),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label) : out(__out), label(__label) {}
-    real out;
-    int label;
-  };
-  std::vector<PredictionResult> predictArray_;
-
- private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-};
-
-/*
- * positive-negative pair rate Validation
- */
-class PnpairValidation : public ValidationLayer {
- public:
-  explicit PnpairValidation(const LayerConfig& config)
-      : ValidationLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
- private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-};
-
-typedef std::shared_ptr<ValidationLayer> ValidationLayerPtr;
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/WarpCTCLayer.cpp b/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
deleted file mode 100644
index 6b1656a52..000000000
--- a/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "WarpCTCLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(warp_ctc, WarpCTCLayer);
-
-bool WarpCTCLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parament class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL);
-
-  /* The inputLayers_[0] must be sequence output without softmax */
-  numClasses_ = config_.size();
-  CHECK_GE(numClasses_, 2UL);
-  CHECK_EQ(numClasses_, inputLayers_[0]->getSize());
-
-  blank_ = config_.blank();
-  CHECK_LT(blank_, numClasses_);
-
-  normByTimes_ = config_.norm_by_times();
-
-  // We don't need sequenceStartPositions because each sample of output_ is
-  // for the cost of one sequence.
-  setNeedSequenceInfo(false);
-
-  return true;
-}
-
-void WarpCTCLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& output = getInput(0);
-  const Argument& labels = getInput(1);
-
-  CHECK(output.sequenceStartPositions);
-  CHECK(labels.sequenceStartPositions);
-  CHECK(labels.ids);
-
-  size_t numSequences = labels.sequenceStartPositions->getSize() - 1;
-  CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1);
-
-  resizeOutput(numSequences, 1);
-
-  const int* cpuLabelStartPositions =
-      labels.sequenceStartPositions->getData(false);
-  const int* cpuOutputStartPositions =
-      output.sequenceStartPositions->getData(false);
-
-  std::vector<int> cpuLabelLengths(numSequences);
-  std::vector<int> cpuOutputLengths(numSequences);
-  for (size_t i = 0; i < numSequences; i++) {
-    cpuLabelLengths[i] =
-        cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i];
-    cpuOutputLengths[i] =
-        cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i];
-  }
-
-  /* Get the maximum sequence length */
-  maxSequenceLength_ = 0;
-  maxSequenceLength_ = *std::max_element(
-      cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences);
-
-  Matrix::resizeOrCreate(batchValue_,
-                         /* height */ numSequences * maxSequenceLength_,
-                         /* width */ numClasses_,
-                         /* trans */ false,
-                         /* useGpu */ useGpu_);
-
-  Matrix::resizeOrCreate(batchGrad_,
-                         /* height */ numSequences * maxSequenceLength_,
-                         /* width */ numClasses_,
-                         /* trans */ false,
-                         /* useGpu */ useGpu_);
-  batchGrad_->zeroMem();
-
-  seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions);
-
-  /* labels always in CPU memory */
-  IVector::resizeOrCreate(cpuLabels_,
-                          /* size */ (labels.ids)->getSize(),
-                          /* useGpu */ false);
-  cpuLabels_->copyFrom(*(labels.ids));
-
-  /* labels always in CPU memory */
-  Matrix::resizeOrCreate(cpuCosts_,
-                         /* height */ numSequences,
-                         /* width */ 1,
-                         /* trans */ false,
-                         /* useGpu */ false);
-
-  /* Init warp-ctc options */
-  hl_warpctc_options_t options;
-  hl_warpctc_init(blank_, useGpu_, &options);
-
-  /* Get the needed workspace size */
-  size_t workspaceBytes = 0;
-  hl_warpctc_get_workspace_size(cpuLabelLengths.data(),
-                                cpuOutputLengths.data(),
-                                numClasses_,
-                                numSequences,
-                                &options,
-                                &workspaceBytes);
-  CHECK_GT(workspaceBytes, 0UL);
-
-  size_t workspaceLength = workspaceBytes / sizeof(real) + 1;
-  Vector::resizeOrCreate(workspace_,
-                         /* size */ workspaceLength,
-                         /* useGpu */ useGpu_);
-
-  hl_warpctc_compute_loss(batchValue_->getData(),
-                          batchGrad_->getData(),
-                          cpuLabels_->getData(),
-                          cpuLabelLengths.data(),
-                          cpuOutputLengths.data(),
-                          numClasses_,
-                          numSequences,
-                          cpuCosts_->getData(),
-                          workspace_->getData(),
-                          &options);
-
-  /* Copy the costs */
-  output_.value->copyFrom(*cpuCosts_);
-}
-
-void WarpCTCLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  const Argument& output = getInput(0);
-  CHECK(batchGrad_);
-
-  batch2seqPadding(
-      output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_);
-}
-
-void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue,
-                                    MatrixPtr& batchValue,
-                                    const ICpuGpuVectorPtr& seqStartPositions) {
-  size_t numSequences = seqStartPositions->getSize() - 1;
-  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
-
-  real* seqData = seqValue->getData();
-  real* batchData = batchValue->getData();
-  if (useGpu_) {
-    hl_sequence2batch_copy_padding(batchData,
-                                   seqData,
-                                   seqStartPositionsData,
-                                   numClasses_,
-                                   maxSequenceLength_,
-                                   numSequences,
-                                   false,
-                                   true);
-  } else {
-    for (size_t i = 0; i < maxSequenceLength_; i++) {
-      for (size_t j = 0; j < numSequences; j++) {
-        size_t sequenceStart = seqStartPositionsData[j];
-        size_t sequenceLength =
-            seqStartPositionsData[j + 1] - seqStartPositionsData[j];
-        if (i < sequenceLength) {
-          memcpy(batchData + (i * numSequences + j) * numClasses_,
-                 seqData + (sequenceStart + i) * numClasses_,
-                 numClasses_ * sizeof(real));
-        } else {
-          memset(batchData + (i * numSequences + j) * numClasses_,
-                 0,
-                 numClasses_ * sizeof(real));
-        }
-      }
-    }
-  }
-}
-
-void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue,
-                                    MatrixPtr& batchValue,
-                                    const ICpuGpuVectorPtr& seqStartPositions,
-                                    bool normByTimes) {
-  size_t numSequences = seqStartPositions->getSize() - 1;
-  const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
-
-  real* seqData = seqValue->getData();
-  real* batchData = batchValue->getData();
-  if (useGpu_) {
-    hl_sequence2batch_copy_padding(batchData,
-                                   seqData,
-                                   seqStartPositionsData,
-                                   numClasses_,
-                                   maxSequenceLength_,
-                                   numSequences,
-                                   normByTimes,
-                                   false);
-  } else {
-    for (size_t i = 0; i < numSequences; i++) {
-      int sequenceStart = seqStartPositionsData[i];
-      int sequenceLength =
-          seqStartPositionsData[i + 1] - seqStartPositionsData[i];
-      real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
-      for (int j = 0; j < sequenceLength; j++) {
-        for (size_t k = 0; k < numClasses_; k++) {
-          seqData[(sequenceStart + j) * numClasses_ + k] =
-              batchData[(j * numSequences + i) * numClasses_ + k] * scale;
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/WarpCTCLayer.h b/paddle/legacy/gserver/layers/WarpCTCLayer.h
deleted file mode 100644
index 3017ca794..000000000
--- a/paddle/legacy/gserver/layers/WarpCTCLayer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer integrating the open-source warp-ctc library
- *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
- *        temporal classification cost.
- *
- * The config file api is warp_ctc_layer.
- */
-class WarpCTCLayer : public Layer {
- public:
-  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
-  ~WarpCTCLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
- protected:
-  /**
-   * sequence matrix and batch matrix copy:
-   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
-   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
-   */
-  void seq2batchPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions);
-  void batch2seqPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions,
-                        bool normByTimes);
-
- protected:
-  size_t numClasses_;
-  size_t blank_;
-  size_t maxSequenceLength_;
-  bool normByTimes_;
-
-  MatrixPtr batchValue_;
-  MatrixPtr batchGrad_;
-  VectorPtr workspace_;
-
-  IVectorPtr cpuLabels_;
-  MatrixPtr cpuCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/gserver/tests/.gitignore b/paddle/legacy/gserver/tests/.gitignore
deleted file mode 100644
index 7f1845d7e..000000000
--- a/paddle/legacy/gserver/tests/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-pyDataProviderBase.py
diff --git a/paddle/legacy/gserver/tests/CMakeLists.txt b/paddle/legacy/gserver/tests/CMakeLists.txt
deleted file mode 100644
index 93ddf5aa2..000000000
--- a/paddle/legacy/gserver/tests/CMakeLists.txt
+++ /dev/null
@@ -1,103 +0,0 @@
-# gserver pacakge unittests
-add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_RecurrentLayer)
-
-if(NOT MOBILE_INFERENCE)
-  add_simple_unittest(test_MultinomialSampler)
-endif()
-
-function(gserver_test TARGET)
-  add_unittest_without_exec(${TARGET}
-      ${TARGET}.cpp
-      LayerGradUtil.cpp)
-  add_test(NAME ${TARGET}
-      COMMAND ${TARGET})
-endfunction()
-
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
-
-gserver_test(test_LayerGrad)
-gserver_test(test_CRFLayerGrad)
-gserver_test(test_CrossEntropyOverBeamGrad)
-gserver_test(test_SeqSliceLayerGrad)
-gserver_test(test_ActivationGrad)
-gserver_test(test_ConvTrans)
-gserver_test(test_PriorBox)
-gserver_test(test_DetectionOutput)
-gserver_test(test_ConvUnify)
-gserver_test(test_BatchNorm)
-gserver_test(test_KmaxSeqScore)
-gserver_test(test_Expand)
-gserver_test(test_MaxPoolingWithMaskOutput)
-gserver_test(test_Upsample)
-
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/gserver/tests)
-function(gserver_test_with_python TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endfunction()
-
-gserver_test_with_python(test_PyDataProvider2)
-if(WITH_PYTHON)
-    gserver_test_with_python(test_PyDataProvider)
-endif()
-if(NOT MOBILE_INFERENCE)
-    gserver_test_with_python(test_CompareTwoNets)
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
-    gserver_test_with_python(test_RecurrentGradientMachine)
-endif()
-
-########## test_MKLDNN layers and activations ##########
-if(WITH_MKLDNN)
-    add_unittest_without_exec(test_MKLDNN
-        test_MKLDNN.cpp
-        MKLDNNTester.cpp
-        LayerGradUtil.cpp)
-    add_test(NAME test_MKLDNN
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-endif()
-
-############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
-    add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp)
-    add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-    ################## test_Evaluator #############
-    add_unittest(test_Evaluator
-        test_Evaluator.cpp)
-      
-    ########### test_NetworkCompare ###############
-    add_unittest_without_exec(test_NetworkCompare
-        test_NetworkCompare.cpp)
-    if(WITH_GPU)
-        set(use_gpu true)
-    else()
-        set(use_gpu false)
-    endif()
-    add_test(NAME test_NetworkCompare
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
-
-    ############ test_CompareSparse ################
-    add_unittest_without_exec(test_CompareSparse
-        test_CompareSparse.cpp)
-    if(NOT ON_TRAVIS)
-      add_test(NAME test_CompareSparse
-        COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
-                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-    endif()
-endif()
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.cpp b/paddle/legacy/gserver/tests/LayerGradUtil.cpp
deleted file mode 100644
index f08c1cd1d..000000000
--- a/paddle/legacy/gserver/tests/LayerGradUtil.cpp
+++ /dev/null
@@ -1,854 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LayerGradUtil.h"
-
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-namespace paddle {
-real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
-  testLayer->forward(PASS_GC);
-  std::vector<Argument> outArgs;
-  outArgs.push_back(testLayer->getOutput());
-  if (weights) {
-    outArgs[0].value->dotMul(*outArgs[0].value, *weights);
-  }
-  return Argument::sum(outArgs);
-}
-
-real getDiffAndPrint(real newCost1,
-                     real newCost2,
-                     real callbackCount,
-                     char fill,
-                     string testLayerName,
-                     string name,
-                     real step,
-                     real delta) {
-  EXPECT_FALSE(std::isnan(newCost1));
-  EXPECT_FALSE(std::isnan(newCost2));
-
-  real trueDelta = (newCost1 - newCost2) * (callbackCount / 2.);
-  real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
-  LOG(INFO) << setiosflags(ios::left) << setfill(fill) << setw(20)
-            << testLayerName << " " << setw(20) << name << "step=" << setw(15)
-            << step << "cost1=" << setw(10) << newCost1 << "cost2=" << setw(10)
-            << newCost2 << "true_delta=" << setw(15) << trueDelta
-            << "analytic_delta=" << setw(15) << delta << "diff=" << diff
-            << (abs(diff) > 0.01 ? " ***" : "");
-  if (fabs(diff - 1) < 0.02) {
-    LOG(INFO) << "The previous diff might be caused by not accumulating"
-              << " parameter gradients in backward()";
-  }
-  return diff;
-}
-
-void testState(LayerPtr testLayer,
-               vector<DataLayerPtr>& dataLayers,
-               vector<Argument>& datas) {
-  auto batchSize = datas[0].getBatchSize();
-  Argument data;
-  ICpuGpuVectorPtr sequenceStartPositions =
-      ICpuGpuVector::create(2, /* useGpu= */ false);
-  sequenceStartPositions->getMutableData(false)[0] = 0;
-  sequenceStartPositions->getMutableData(false)[1] = batchSize;
-  data.sequenceStartPositions = sequenceStartPositions;
-  testLayer->resetState();
-  for (size_t j = 0; j < datas.size(); ++j) {
-    if (datas[j].value) {
-      data.value = datas[j].value;
-    }
-    if (datas[j].ids) {
-      data.ids = datas[j].ids;
-    }
-    dataLayers[j]->setData(data);
-    dataLayers[j]->forward(PASS_TEST);
-  }
-  testLayer->forward(PASS_TEST);
-  Argument batchOut;
-  batchOut.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-
-  sequenceStartPositions->getMutableData(false)[1] = 1;
-  testLayer->resetState();
-
-  auto testLayerState = [&](int batchId) {
-    for (size_t j = 0; j < datas.size(); ++j) {
-      if (datas[j].value) {
-        data.value = datas[j].value->subMatrix(batchId, 1);
-      }
-      if (datas[j].ids) {
-        data.ids = IVector::create(
-            datas[j].ids->getData() + batchId, 1, FLAGS_use_gpu);
-      }
-      dataLayers[j]->setData(data);
-      dataLayers[j]->forward(PASS_TEST);
-    }
-
-    testLayer->forward(PASS_TEST);
-    Argument out;
-    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    if (batchOut.value) {
-      size_t dim = batchOut.value->getWidth();
-      ASSERT_TRUE((bool)out.value);
-      EXPECT_EQ(dim, out.value->getWidth());
-      EXPECT_EQ(1UL, out.value->getHeight());
-      auto ret = std::mismatch(batchOut.value->getData() + batchId * dim,
-                               batchOut.value->getData() + (batchId + 1) * dim,
-                               out.value->getData());
-      if (ret.second != out.value->getData() + dim) {
-        // If reaches here, the test will fail
-        EXPECT_EQ(*ret.first, *ret.second);
-      }
-    } else if (batchOut.ids) {
-      ASSERT_TRUE((bool)out.ids);
-      EXPECT_EQ(1UL, out.ids->getSize());
-      EXPECT_EQ(batchOut.ids->getElement(batchId), out.ids->getElement(0));
-    }
-  };
-
-  CHECK_GT(batchSize, 0);
-  std::vector<LayerStatePtr> statePtrs;
-  statePtrs.reserve(batchSize);
-
-  // Test layer setState() and getState()
-  for (int i = 0; i < batchSize; ++i) {
-    statePtrs.push_back(testLayer->getState());
-    testLayerState(i);
-  }
-  for (int k = 0; k < batchSize - 1; ++k) {
-    testLayer->setState(statePtrs[k]);
-    for (int i = k; i < batchSize; ++i) {
-      testLayerState(i);
-    }
-  }
-}
-
-void testBatchState(LayerPtr testLayer,
-                    vector<DataLayerPtr>& dataLayers,
-                    vector<Argument>& datas) {
-  auto batchSize = datas[0].getBatchSize();
-  Argument data;
-  /*two sequences*/
-  size_t numSequences = 2;
-  ICpuGpuVectorPtr sequenceStartPositions =
-      ICpuGpuVector::create(numSequences + 1, /* useGpu= */ false);
-  int* cpuStarts = sequenceStartPositions->getMutableData(false);
-  int len = ::rand() % (batchSize - 1);
-  cpuStarts[0] = 0;
-  cpuStarts[1] = len > 0 ? len : 1;
-  cpuStarts[2] = batchSize;
-
-  data.sequenceStartPositions = sequenceStartPositions;
-  for (size_t j = 0; j < datas.size(); ++j) {
-    if (datas[j].value) {
-      data.value = datas[j].value;
-    }
-    if (datas[j].ids) {
-      data.ids = datas[j].ids;
-    }
-    dataLayers[j]->setData(data);
-    dataLayers[j]->forward(PASS_TEST);
-  }
-  testLayer->resetState();
-  testLayer->forward(PASS_TEST);
-  Argument batchOut;
-  batchOut.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-
-  /*split one miniBatch into two miniBatchs*/
-  std::vector<int> seqSplitPos;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    int len = ::rand() % (cpuStarts[seqId + 1] - cpuStarts[seqId]);
-    len = len > 0 ? len : 1;
-    seqSplitPos.push_back(cpuStarts[seqId] + len);
-  }
-
-  std::vector<int> start; /*seq start pos in source data*/
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    start.push_back(cpuStarts[seqId]);
-  }
-  testLayer->resetState();
-  Argument splitData;
-  for (size_t batchId = 0; batchId < 2; ++batchId) {
-    size_t splitBatchSize = 0;
-    std::vector<int> seqLens;
-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      int seqLen = (batchId == 0) ? seqSplitPos[seqId] - cpuStarts[seqId]
-                                  : cpuStarts[seqId + 1] - seqSplitPos[seqId];
-      seqLens.push_back(seqLen);
-      splitBatchSize += seqLen;
-    }
-    ICpuGpuVectorPtr cpuSeqStartPos =
-        ICpuGpuVector::create(3, /* useGpu= */ false);
-    int* seqStartPosData = cpuSeqStartPos->getMutableData(false);
-    seqStartPosData[0] = 0;
-    seqStartPosData[1] = seqLens[0];
-    seqStartPosData[2] = splitBatchSize;
-
-    CHECK_GT(splitBatchSize, size_t(0));
-    splitData.sequenceStartPositions = cpuSeqStartPos;
-    for (size_t j = 0; j < datas.size(); ++j) {
-      if (datas[j].value) {
-        Matrix::resizeOrCreate(splitData.value,
-                               splitBatchSize,
-                               datas[j].value->getWidth(),
-                               false,
-                               FLAGS_use_gpu);
-        for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-          if (seqLens[seqId]) {
-            splitData.value->subMatrix(seqStartPosData[seqId], seqLens[seqId])
-                ->copyFrom(
-                    *datas[j].value->subMatrix(start[seqId], seqLens[seqId]));
-          }
-        }
-      }
-      if (datas[j].ids) {
-        IVector::resizeOrCreate(splitData.ids, splitBatchSize, FLAGS_use_gpu);
-        for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-          if (seqLens[seqId]) {
-            splitData.ids->subVec(seqStartPosData[seqId], seqLens[seqId])
-                ->copyFrom(*datas[j].ids->subVec(start[seqId], seqLens[seqId]));
-          }
-        }
-      }
-      dataLayers[j]->setData(splitData);
-      dataLayers[j]->forward(PASS_TEST);
-    }
-
-    testLayer->forward(PASS_TEST);
-    Argument out;
-    out.resizeAndCopyFrom(testLayer->getOutput(), /* useGpu= */ false);
-    if (batchOut.value) {
-      size_t dim = batchOut.value->getWidth();
-      ASSERT_TRUE((bool)out.value);
-      EXPECT_EQ(dim, out.value->getWidth());
-      for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-        if (seqLens[seqId]) {
-          out.value->subMatrix(seqStartPosData[seqId], seqLens[seqId])
-              ->sub(*batchOut.value->subMatrix(start[seqId], seqLens[seqId]));
-        }
-      }
-    }
-
-    std::vector<Argument> args;
-    args.push_back(out);
-    ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      start[seqId] += seqLens[seqId];
-    }
-  }
-}
-
-double genPerturbation(const real* oldGrad, real* newGrad, size_t dim) {
-  double gradNorm = 0, dNorm = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    newGrad[i] = 2. * rand() / RAND_MAX - 1;  // NOLINT
-    dNorm += newGrad[i] * newGrad[i];
-    gradNorm += oldGrad[i] * oldGrad[i];
-  }
-  if (gradNorm > 0) {
-    real s = 0.5 * sqrt(gradNorm / dNorm);
-    for (size_t i = 0; i < dim; ++i) {
-      newGrad[i] = s * newGrad[i] + oldGrad[i];
-    }
-  }
-  double delta = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    delta += oldGrad[i] * newGrad[i];
-  }
-  return delta;
-}
-
-void initWeight(MatrixPtr& weights) {
-  MatrixPtr tmpMat = weights->clone();
-  for (int i = 0; i < int(tmpMat->getElementCnt()); i++) {
-    tmpMat->getData()[i] = (11 - 2 * (i % 11));
-  }
-  weights->copyFrom(*tmpMat);
-}
-
-void initBatchState(LayerPtr dataLayer,
-                    LayerPtr testLayer,
-                    LayerStatePtr state,
-                    bool useGpu) {
-  int sequenceNum = dataLayer->getOutput().getNumSequences();
-  MatrixPtr prevBatchOutput =
-      Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu);
-  MatrixPtr prevBatchState =
-      Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu);
-  prevBatchOutput->randomizeUniform();
-  prevBatchState->randomizeUniform();
-  state->value.clear();
-  state->value.push_back(prevBatchOutput);
-  state->value.push_back(prevBatchState);
-}
-
-void initDataLayer(TestConfig testConf,
-                   std::vector<DataLayerPtr>* dataLayers,
-                   vector<Argument>* datas,
-                   LayerMap* layerMap,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu) {
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  IVectorPtr cpuSequenceDims;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    if (testConf.inputDefs[i].inputType != INPUT_SEQUENCE_LABEL) continue;
-
-    const std::vector<int>& labelSeqStartPositions =
-        testConf.inputDefs[i].labelSeqStartPositions;
-    if (labelSeqStartPositions.size() != 0) {
-      CHECK(!sequenceStartPositions);
-      CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
-
-      sequenceStartPositions =
-          ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
-      sequenceStartPositions->copyFrom(
-          labelSeqStartPositions.data(), labelSeqStartPositions.size(), useGpu);
-    }
-  }
-
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    LayerConfig config;
-    config.set_name(testConf.inputDefs[i].name);
-    config.set_type("data");
-    config.set_size(testConf.inputDefs[i].dim);
-    LayerPtr layer = LayerPtr(new DataLayer(config));
-    size_t numSequence = sequenceStartPositions
-                             ? sequenceStartPositions->getSize() - 1
-                             : batchSize / 10 + 1;
-
-    Argument data;
-    auto fillData = [&](bool trans, int height, int width) {
-      int newHeight = trans ? height : width;
-      int newWidth = trans ? width : height;
-      data.value = Matrix::create(newHeight, newWidth, false, useGpu);
-      data.grad = Matrix::create(newHeight, newWidth, false, useGpu);
-    };
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_HASSUB_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-      case INPUT_SEQUENCE_MDIM_DATA:
-        fillData(trans, layer->getSize(), batchSize);
-        data.value->randomizeUniform();
-        // make sure that multi-class-cross-entry won't encounter negatives
-        // make sure that multi_binary_label satisfies 0~1
-        data.value->add(-0.5);
-        if (testLayerName != "prelu") {
-          data.value->sigmoid(*data.value);
-        }
-        data.grad->zeroMem();
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        if (testConf.inputDefs[i].labelInitValue.size() != 0) {
-          const std::vector<int>& labelInitValue =
-              testConf.inputDefs[i].labelInitValue;
-          CHECK_EQ(labelInitValue.size(), batchSize);
-          data.ids = VectorT<int>::create(batchSize, useGpu);
-          data.ids->copyFrom(labelInitValue.data(), batchSize);
-        } else {
-          data.ids = VectorT<int>::create(batchSize, useGpu);
-          // now rand number can be 0 to inputDefs[i].dim
-          data.ids->rand(testConf.inputDefs[i].dim);
-        }
-        break;
-      case INPUT_SPARSE_NON_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(
-            batchSize,
-            layer->getSize(),
-            /* withValue= */ false,
-            useGpu,
-            testConf.inputDefs[i].sparse.equalNnzPerSample);
-        break;
-      case INPUT_SPARSE_FLOAT_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(batchSize,
-                                            layer->getSize(),
-                                            /* withValue= */ true,
-                                            useGpu);
-        break;
-      case INPUT_DENSE_DIM_DATA:
-        fillData(trans, layer->getSize(), numSequence);
-        data.value->randomizeUniform();
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        data.grad->zeroMem();
-        break;
-      case INPUT_SELF_DEFINE_DATA: {
-        if (testConf.inputDefs[i].ids.size()) {
-          data.ids = IVector::create(testConf.inputDefs[i].ids.size(), useGpu);
-          data.ids->copyFrom(testConf.inputDefs[i].ids.data(),
-                             testConf.inputDefs[i].ids.size());
-        } else if (testConf.inputDefs[i].selfDefinedData) {
-          size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
-          size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
-          CHECK_GT(static_cast<int>(height), 0);
-          CHECK_GT(static_cast<int>(width), 0);
-          data.value = Matrix::create(height, width, false, useGpu);
-          data.grad = Matrix::create(height, width, false, useGpu);
-          data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
-          data.grad->zeroMem();
-        } else {
-          LOG(FATAL) << "No self-defined data are given.";
-          return;
-        }
-
-        const std::vector<int>& labelSeqStartPositions =
-            testConf.inputDefs[i].labelSeqStartPositions;
-        if (labelSeqStartPositions.size() != 0) {
-          CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
-
-          sequenceStartPositions =
-              ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
-          sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
-                                           labelSeqStartPositions.size(),
-                                           useGpu);
-          data.sequenceStartPositions = sequenceStartPositions;
-        }
-
-        const std::vector<int>& labelSubSeqStartPositions =
-            testConf.inputDefs[i].labelSubSeqStartPositions;
-        if (labelSubSeqStartPositions.size() != 0) {
-          CHECK_GE(static_cast<int>(labelSubSeqStartPositions.size()), 2);
-
-          subSequenceStartPositions =
-              ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu);
-          subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(),
-                                              labelSubSeqStartPositions.size(),
-                                              useGpu);
-          data.subSequenceStartPositions = subSequenceStartPositions;
-        }
-        break;
-      }
-      default:
-        LOG(FATAL) << " unknown inputType ";
-        return;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_HASSUB_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_MDIM_DATA) {
-      if (!sequenceStartPositions) {
-        generateSequenceStartPositions(batchSize, sequenceStartPositions);
-      }
-      data.sequenceStartPositions = sequenceStartPositions;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_HASSUB_SEQUENCE_DATA) {
-      if (!subSequenceStartPositions) {
-        generateSubSequenceStartPositions(sequenceStartPositions,
-                                          subSequenceStartPositions);
-      }
-      data.subSequenceStartPositions = subSequenceStartPositions;
-    }
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_MDIM_DATA) {
-      if (!cpuSequenceDims) {
-        generateMDimSequenceData(sequenceStartPositions, cpuSequenceDims);
-      }
-      data.cpuSequenceDims = cpuSequenceDims;
-    }
-
-    DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-    dataLayer->setData(data);
-    dataLayer->forward(PASS_GC);
-    dataLayers->push_back(dataLayer);
-    (*layerMap)[config.name()] = layer;
-    datas->push_back(data);
-  }
-}
-
-void initTestLayer(TestConfig testConf,
-                   LayerMap* layerMap,
-                   std::vector<ParameterPtr>* parameters,
-                   LayerPtr* testLayer) {
-  ParameterMap parameterMap;
-  size_t index = 0;
-  LayerConfig testConfig = testConf.layerConfig;
-  CHECK_EQ(testConf.inputDefs.size(),
-           size_t(testConf.layerConfig.inputs_size()));
-
-  auto initParameter = [&](string paraName,
-                           size_t paraSize,
-                           bool isStatic,
-                           bool initialize,
-                           ParameterConfig paraConfig) {
-    paraConfig.set_name(paraName);
-    paraConfig.set_size(paraSize);
-    paraConfig.set_is_static(isStatic);
-    auto para =
-        std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
-    para->enableType(PARAMETER_VALUE);
-    if (!para->isStatic()) {
-      para->enableType(PARAMETER_GRADIENT);
-      para->enableType(PARAMETER_MOMENTUM);
-    }
-    para->randomize();
-    para->setID(index++);
-    parameters->push_back(para);
-    parameterMap[paraConfig.name()] = para;
-  };
-
-  for (size_t i = 0; i < testConf.inputDefs.size(); i++) {
-    InputDef inputDef = testConf.inputDefs[i];
-    size_t paraSize = inputDef.paraSize;
-    bool sparse = inputDef.sparse.sparse;
-    LayerInputConfig& input = *(testConfig.mutable_inputs(i));
-    input.set_input_layer_name(inputDef.name);
-
-    if (paraSize) {
-      constexpr int kParaNameLen = 20;
-      char paraName[kParaNameLen];
-      snprintf(paraName, kParaNameLen, "para_%d", (int)i);
-      input.set_input_parameter_name(paraName);
-      ParameterConfig paraConfig;
-      paraConfig.set_is_sparse(sparse);
-      paraConfig.set_format(inputDef.sparse.format);
-      if (sparse) {
-        paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
-        paraConfig.add_dims(testConf.layerConfig.size());
-      }
-      CHECK_GE(testConf.paramInitialStd, 0);
-      paraConfig.set_initial_mean(testConf.paramInitialMean);
-      paraConfig.set_initial_std(testConf.paramInitialStd);
-      initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
-    }
-  }
-  if (testConf.biasSize) {
-    testConfig.set_bias_parameter_name("bias");
-    ParameterConfig paraConfig;
-    initParameter(testConfig.bias_parameter_name(),
-                  testConf.biasSize,
-                  testConf.staticBias,
-                  true,
-                  paraConfig);
-  }
-
-  *testLayer = Layer::create(testConfig);
-  (*layerMap)[testConfig.name()] = *testLayer;
-  (*testLayer)->init((*layerMap), parameterMap);
-  (*testLayer)->setNeedGradient(true);
-}
-
-void testPerturbParameter(TestConfig testConf,
-                          const MatrixPtr weights,
-                          const LayerStatePtr state,
-                          real cost,
-                          real callbackCount,
-                          real* maxDiff,
-                          LayerPtr testLayer,
-                          std::vector<ParameterPtr>* parameters) {
-  char fill = ' ';
-  for (auto& parameter : *parameters) {
-    if (parameter->isStatic()) {
-      continue;
-    }
-
-    size_t dim = parameter->getSize();
-    CpuVector oldPara(dim);
-    CpuVector newPara(dim);
-    VectorPtr v = parameter->getBuf(PARAMETER_VALUE);
-    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
-    vector<real> d(dim);
-
-    double delta = genPerturbation(cpuGrad.getData(), &d[0], dim);
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    if (fabs(step) < 1e-6) step = 1e-6;
-    delta *= step;
-
-    // compute newCost
-    real newCost[2];
-    for (int k = 0; k < 2; k++) {
-      for (size_t i = 0; i < dim; ++i) {
-        newp[i] = (k == 0) ? oldp[i] + step * d[i] : oldp[i] - step * d[i];
-      }
-      if (testConf.testBatchState) {
-        testLayer->setState(state);
-      }
-      parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-      parameter->setValueUpdated();
-      newCost[k] = getCostSum(testLayer, weights);
-    }
-    real diff = getDiffAndPrint(newCost[0],
-                                newCost[1],
-                                callbackCount,
-                                fill,
-                                testLayer->getName(),
-                                parameter->getName(),
-                                step,
-                                delta);
-    *maxDiff = std::max(*maxDiff, abs(diff));
-    // restore parameter
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
-    parameter->setValueUpdated();
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-}
-
-void testPerturbInput(TestConfig testConf,
-                      const MatrixPtr weights,
-                      const LayerStatePtr state,
-                      real cost,
-                      real callbackCount,
-                      real* maxDiff,
-                      LayerPtr testLayer,
-                      std::vector<DataLayerPtr> dataLayers) {
-  char fill = ' ';
-  for (size_t index = 0; index < testConf.inputDefs.size(); index++) {
-    InputType inputType = testConf.inputDefs[index].inputType;
-    if (inputType != INPUT_DATA && inputType != INPUT_SEQUENCE_DATA &&
-        inputType != INPUT_HASSUB_SEQUENCE_DATA) {
-      continue;
-    }
-
-    MatrixPtr outV = dataLayers[index]->getOutputValue();
-    int height = outV->getHeight();
-    int width = outV->getWidth();
-    size_t dim = height * width;
-
-    CpuMatrix oldPara(height, width);
-    CpuMatrix newPara(height, width);
-    oldPara.copyFrom(*outV);
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuMatrix cpuGrad(height, width);
-    cpuGrad.copyFrom(*(dataLayers[index]->getOutputGrad()));
-    CpuMatrix d(height, width);
-    real* data = d.getData();
-
-    double delta = genPerturbation(cpuGrad.getData(), data, dim);
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    if (fabs(step) < 1e-6) step = 1e-6;
-    delta *= step;
-
-    real newCost[2];
-    for (int k = 0; k < 2; k++) {
-      for (size_t i = 0; i < dim; ++i) {
-        newp[i] =
-            (k == 0) ? oldp[i] + step * data[i] : oldp[i] - step * data[i];
-      }
-      if (testConf.testBatchState) {
-        testLayer->setState(state);
-      }
-      outV->copyFrom(newPara);
-      newCost[k] = getCostSum(testLayer, weights);
-    }
-
-    real diff = getDiffAndPrint(newCost[0],
-                                newCost[1],
-                                callbackCount,
-                                fill,
-                                testLayer->getName(),
-                                dataLayers[index]->getName(),
-                                step,
-                                delta);
-    *maxDiff = std::max(*maxDiff, abs(diff));
-    // restore parameter
-    outV->copyFrom(oldPara);
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-}
-
-void testLayerGradKernel(TestConfig testConf,
-                         string testLayerName,
-                         size_t batchSize,
-                         bool trans,
-                         bool useGpu,
-                         bool useWeight,
-                         float epsilon) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  FLAGS_prev_batch_state = testConf.testBatchState;
-  MatrixPtr weights = nullptr;
-  testConf.layerConfig.set_name(testLayerName);
-  LOG(INFO) << " layer_type=" << testConf.layerConfig.type()
-            << " useGpu=" << useGpu;
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(testConf,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                testLayerName,
-                batchSize,
-                trans,
-                useGpu);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr testLayer;
-  initTestLayer(testConf, &layerMap, &parameters, &testLayer);
-
-  LayerStatePtr state = std::make_shared<LayerState>();
-  if (testConf.testBatchState) {
-    initBatchState(dataLayers[0], testLayer, state, useGpu);
-    testLayer->resetState();
-    testLayer->setState(state);
-  }
-
-  testLayer->forward(PASS_GC);
-  if (useWeight && weights == nullptr) {
-    weights = testLayer->getOutput().value->clone(0, 0, useGpu);
-    initWeight(weights);
-  }
-  std::vector<Argument> outArgs;
-  outArgs.push_back(testLayer->getOutput());
-  if (useWeight) {
-    outArgs[0].value = outArgs[0].value->clone(0, 0, useGpu);
-    outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights);
-  }
-
-  real cost = Argument::sum(outArgs);
-  LOG(INFO) << " cost " << cost;
-  EXPECT_FALSE(std::isnan(cost));
-
-  // Test whether the callback is called for a parameter
-  if (testLayer->getOutputGrad()) {
-    useWeight ? testLayer->getOutput().grad->copyFrom(*weights)
-              : testLayer->getOutputGrad()->resetOne();
-  }
-  vector<int> callbackFlags(parameters.size(), 0);
-  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
-  testLayer->backward(callback);
-
-  // do forward and backward for another time to test that gradient is doubled
-  int callbackCount = 1;
-  if (testConf.testAccumulate) {
-    if (testConf.testBatchState) {
-      testLayer->setState(state);
-    }
-    testLayer->forward(PASS_GC);
-    if (testLayer->getOutputGrad()) {
-      useWeight ? testLayer->getOutput().grad->copyFrom(*weights)
-                : testLayer->getOutputGrad()->resetOne();
-    }
-    testLayer->backward(callback);
-    ++callbackCount;
-  }
-  for (size_t i = 0; i < parameters.size(); ++i) {
-    EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, callbackFlags[i]);
-  }
-
-  // Test whether the layer's forward calculation is stable
-  // by adding perturbation to its parameters or its input layers
-  real maxDiff = 0;
-  testPerturbParameter(testConf,
-                       weights,
-                       state,
-                       cost,
-                       callbackCount,
-                       &maxDiff,
-                       testLayer,
-                       &parameters);
-  testPerturbInput(testConf,
-                   weights,
-                   state,
-                   cost,
-                   callbackCount,
-                   &maxDiff,
-                   testLayer,
-                   dataLayers);
-  EXPECT_LE(fabs(maxDiff), epsilon);
-
-  if (testConf.testState) {
-    testState(testLayer, dataLayers, datas);
-  }
-  if (testConf.testBatchState) {
-    testBatchState(testLayer, dataLayers, datas);
-  }
-}
-
-void testLayerGrad(TestConfig testConf,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu,
-                   bool useWeight,
-                   float epsilon) {
-  testLayerGradKernel(
-      testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon);
-  bool isStaticTest = false;
-  LayerConfig testConfig = testConf.layerConfig;
-  for (size_t i = 0; i < testConf.inputDefs.size(); i++) {
-    InputDef inputDef = testConf.inputDefs[i];
-    // Some layer must set isStatic true, like DataNormLayer
-    // so use !isStatic in if
-    if (inputDef.paraSize && (!inputDef.isStatic)) {
-      testConf.inputDefs[i].isStatic = true;
-      isStaticTest = true;
-    }
-  }
-
-  if (testConf.biasSize) {
-    testConf.staticBias = true;
-    isStaticTest = true;
-  }
-  if (isStaticTest) {
-    testLayerGradKernel(
-        testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon);
-  }
-}
-
-void testProjectionGrad(ProjectionConfig conf,
-                        InputType inputType,
-                        size_t parameterSize,
-                        size_t batchSize,
-                        bool useGpu,
-                        bool testState,
-                        int biasSize,
-                        bool sharedBias) {
-  TestConfig config;
-  conf.set_name(conf.type());
-  config.layerConfig.set_type("mixed");
-  config.layerConfig.set_size(conf.output_size());
-  config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
-  config.layerConfig.set_bias_size(config.biasSize);
-  config.layerConfig.set_shared_biases(sharedBias);
-  config.inputDefs.push_back({inputType,
-                              "layer_0",
-                              static_cast<size_t>(conf.input_size()),
-                              parameterSize});
-  *config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
-  config.testState = testState;
-  testLayerGrad(config, "mixed", batchSize, false, useGpu);
-}
-
-void testOperatorGrad(TestConfig& config,
-                      OperatorConfig& operatorConf,
-                      size_t batchSize,
-                      bool useGpu,
-                      bool testState) {
-  config.layerConfig.set_type("mixed");
-
-  operatorConf.set_output_size(config.layerConfig.size());
-  for (size_t i = 0; i < config.inputDefs.size(); ++i) {
-    operatorConf.add_input_indices(i);
-    operatorConf.add_input_sizes(config.inputDefs[i].dim);
-  }
-
-  config.testState = testState;
-  testLayerGrad(config, "mixed", batchSize, false, useGpu);
-}
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.h b/paddle/legacy/gserver/tests/LayerGradUtil.h
deleted file mode 100644
index 941989a1d..000000000
--- a/paddle/legacy/gserver/tests/LayerGradUtil.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-using namespace std;  // NOLINT
-
-namespace paddle {
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_HASSUB_SEQUENCE_DATA,  // sequence has sub-sequence
-  INPUT_SEQUENCE_MDIM_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA,
-  INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
-  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
-};
-
-struct ParaSparse {
-  bool sparse;
-  string format;
-  // if equalNnzPerSample is set true,
-  // every row of the sparse matrix in a format of CSR has a same
-  // number of nnz values. Currently, this flag is only used for
-  // selective_fc layer
-  bool equalNnzPerSample;
-  ParaSparse(const string& formatIn = "") {  // NOLINT
-    if (formatIn == "") {
-      sparse = false;
-    } else {
-      sparse = true;
-    }
-    equalNnzPerSample = false;
-  }
-  ParaSparse(const string& formatIn, bool equalNnz) {
-    format = formatIn;
-    sparse = true;
-    equalNnzPerSample = equalNnz;
-  }
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-  size_t paraSize;
-  ParaSparse sparse;
-  bool isStatic;
-  std::vector<int> labelInitValue;
-  std::vector<int> labelSeqStartPositions;
-  std::vector<int> labelSubSeqStartPositions;
-  std::vector<int> ids;
-  MatrixPtr selfDefinedData;
-
-  InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           MatrixPtr selfDefinedData,
-           std::vector<int> selfDefinedSeqStartPos = {},
-           std::vector<int> selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        selfDefinedData(selfDefinedData) {
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           const std::vector<int>& ids,
-           const std::vector<int>& selfDefinedSeqStartPos = {},
-           const std::vector<int>& selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        ids(ids) {
-    selfDefinedData = nullptr;
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           const std::vector<int>& labelInitValue,
-           const std::vector<int>& labelSeqStartPositions)
-      : labelInitValue(labelInitValue),
-        labelSeqStartPositions(labelSeqStartPositions) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           ParaSparse sparseIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = sparseIn;
-  }
-};
-
-struct TestConfig {
-  LayerConfig layerConfig;
-  std::vector<InputDef> inputDefs;
-  size_t biasSize;
-  real paramInitialMean;
-  real paramInitialStd;
-  bool testAccumulate;
-  bool testState;
-  bool staticBias;
-  bool testBatchState;
-  TestConfig()
-      : biasSize(0),
-        paramInitialMean(0.0),
-        paramInitialStd(1.0),
-        testAccumulate(true),
-        testState(false),
-        staticBias(false),
-        testBatchState(false) {}
-};
-
-real getCostSum(ParameterPtr& parameter,
-                CpuVector& cpuPara,
-                LayerPtr& testLayer,
-                MatrixPtr weights = nullptr);
-
-real getDiffAndPrint(real newCost1,
-                     real newCost2,
-                     real callbackCount,
-                     char fill,
-                     string testLayerName,
-                     string name,
-                     real step,
-                     real delta);
-
-/**
- * @brief verify that sequentially running forward() one timestamp at one time
- *        has same result as running forward() with one whole sequence
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testState(LayerPtr testLayer,
-               vector<DataLayerPtr>& dataLayers,
-               vector<Argument>& datas);
-
-/**
- * @brief verify that sequentially running forward() with short sequences one
- *        time has same result as running forward() with long sequences.
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testBatchState(LayerPtr testLayer,
-                    vector<DataLayerPtr>& dataLayers,
-                    vector<Argument>& datas);
-
-/**
- * @brief Generate a perturbation so that it is roughly aligned with the
- *        gradient direction. This is to make sure that change along this
- *        direction will make cost increase (or decrease) in a meaningful
- *        way so that the finite difference can be used to approximate the
- *        directional dirivative well.
- *
- * @param oldGrad[in]  input gradient
- *        newGrad[out] output gradient
- *        dim          dimension of oldGrad/newGrad
- *
- * @return sum_i(oldGrad[i] * newGrad[i])
- */
-double genPerturbation(const real* oldGrad, real* newGrad, size_t dim);
-
-void initWeight(MatrixPtr& weights);
-
-void initBatchState(LayerPtr dataLayer,
-                    LayerPtr testLayer,
-                    LayerStatePtr state,
-                    bool useGpu);
-
-/**
- * @brief initialize the dataLayer by its inputType
- *
- * @param testConf[in]        test config
- *        dataLayers[out]     dataLayers
- *        datas[out]          initialized data of dataLayers
- *        layerMap[out]       layerMap
- */
-void initDataLayer(TestConfig testConf,
-                   std::vector<DataLayerPtr>* dataLayers,
-                   vector<Argument>* datas,
-                   LayerMap* layerMap,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu);
-
-/**
- * @brief initialize the parameter of testLayer
- *
- * @param testConf[in/out]    test config
- *        layerMap[out]       layerMap
- *        parameters[out]     parameters of testLayer
- *        testLayer[out]      testLayer
- */
-void initTestLayer(TestConfig testConf,
-                   LayerMap* layerMap,
-                   std::vector<ParameterPtr>* parameters,
-                   LayerPtr* testLayer);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its parameters
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        parameters[in/out]   parameters of testLayer
- */
-void testPerturbParameter(TestConfig testConf,
-                          const MatrixPtr weights,
-                          const LayerStatePtr state,
-                          real cost,
-                          real callbackCount,
-                          real* maxDiff,
-                          LayerPtr testLayer,
-                          std::vector<ParameterPtr>* parameters);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its input layers
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        dataLayers[in/out]   dataLayers
- */
-void testPerturbInput(TestConfig testConf,
-                      const MatrixPtr weights,
-                      const LayerStatePtr state,
-                      real cost,
-                      real callbackCount,
-                      real* maxDiff,
-                      LayerPtr testLayer,
-                      std::vector<DataLayerPtr> dataLayers);
-
-void testLayerGradKernel(TestConfig testConf,
-                         string testLayerName,
-                         size_t batchSize,
-                         bool trans,
-                         bool useGpu,
-                         bool useWeight = false,
-                         float epsilon = 0.02);
-
-void testLayerGrad(TestConfig testConf,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu,
-                   bool useWeight = false,
-                   float epsilon = 0.02);
-
-void testProjectionGrad(ProjectionConfig conf,
-                        InputType inputType,
-                        size_t parameterSize,
-                        size_t batchSize,
-                        bool useGpu,
-                        bool testState = false,
-                        int biasSize = 0,
-                        bool sharedBias = false);
-
-void testOperatorGrad(TestConfig& config,
-                      OperatorConfig& operatorConf,
-                      size_t batchSize,
-                      bool useGpu,
-                      bool testState = false);
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.cpp b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
deleted file mode 100644
index b550ba9c7..000000000
--- a/paddle/legacy/gserver/tests/MKLDNNTester.cpp
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNTester.h"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-#include "paddle/legacy/trainer/Trainer.h"
-
-namespace paddle {
-
-// init data layer and test layer of both dnn and reference
-void MKLDNNTester::reset(const TestConfig& dnn,
-                         const TestConfig& ref,
-                         size_t batchSize) {
-  const bool trans = false;
-  const bool useGpu = false;
-
-  // clear
-  configs_.clear();
-  layerNames_.clear();
-  dataLayers_.clear();
-  datas_.clear();
-  layerMaps_.clear();
-  parameters_.clear();
-  testLayers_.clear();
-
-  // resize
-  configs_.resize(NUM);
-  layerNames_.resize(NUM);
-  dataLayers_.resize(NUM);
-  datas_.resize(NUM);
-  layerMaps_.resize(NUM);
-  parameters_.resize(NUM);
-  testLayers_.resize(NUM);
-
-  // reset configs and layer names
-  configs_[DNN] = dnn;
-  configs_[REF] = ref;
-  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
-  layerNames_[REF] = "reference";  // second is reference layer
-
-  // reset others
-  for (size_t i = 0; i < NUM; ++i) {
-    configs_[i].layerConfig.set_name(layerNames_[i]);
-    initDataLayer(configs_[i],
-                  &(dataLayers_[i]),
-                  &(datas_[i]),
-                  &(layerMaps_[i]),
-                  layerNames_[i],
-                  batchSize,
-                  trans,
-                  useGpu);
-    initTestLayer(
-        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
-  }
-  refLayer_ = testLayers_[REF];
-  dnnLayer_ = testLayers_[DNN];
-  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  setInputImgSize();
-
-  // for comparison with Paddle reference results,
-  // need manually add cpu device output for test
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->addOutputArgument(CPU_DEVICE);
-  }
-}
-
-void MKLDNNTester::setInputImgSize() {
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-      // TODO(TJ): fix me when concat and elewise ready
-      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
-      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
-    }
-  }
-}
-
-// init randome parameters of ref, and copy to mkldnn
-void MKLDNNTester::randomWgtDatas() {
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
-    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    parameters_[REF][i]->randomize();
-    if (isBN && i == 2) {
-      // this param is moving average in batch norm, which must larger than 0
-      real offset = fabs(refValue->getMin()) + 1.0;
-      refValue->add(offset);
-    }
-    dnnValue->copyFrom(*refValue);
-
-    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
-    printVector(dnnValue);
-  }
-}
-
-// random botdata of ref layer and copy same to mkldnn
-void MKLDNNTester::randomBotDatas() {
-  CHECK_EQ(dataLayers_.size(), NUM);
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
-    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
-        *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
-    printMatrix(dataLayers_[REF][i]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::randomTopDiffs() {
-  refLayer_->getOutputGrad()->randomizeUniform();
-  dnnLayer_->getOutput(CPU_DEVICE)
-      .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
-  printMatrix(refLayer_->getOutputGrad());
-}
-
-void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_TESTS) << "Check Forward";
-  printTopDatas();
-  double delta =
-      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
-  EXPECT_LE(fabs(delta), eps_);
-}
-
-void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Data";
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
-    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
-    printMatrix(dnnDiff);
-    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
-    printMatrix(refDiff);
-
-    double delta = compareMatrix(refDiff, dnnDiff);
-    EXPECT_LE(fabs(delta), eps_);
-    if (isBN) {
-      // the other two inputs in batch norm are for moving mean and var
-      // do not have grad to compare
-      break;
-    }
-  }
-}
-
-void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
-  saveWgt(parameters_[DNN], dnnWgts);
-
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->convertWeightsToPaddle();
-  }
-  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
-    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
-                     << parameters_[DNN][i]->getName();
-    printVector(dnn);
-    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
-                     << parameters_[REF][i]->getName();
-    printVector(ref);
-
-    double delta = compareVector(ref, dnn);
-    EXPECT_LE(fabs(delta), eps_);
-  }
-
-  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
-  restoreWgt(dnnWgts, parameters_[DNN]);
-}
-
-void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
-                           vector<VectorPtr>& to) {
-  const bool useGpu = false;
-  to.resize(from.size());
-  for (size_t i = 0; i < to.size(); ++i) {
-    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
-    to[i] = Vector::create(wgt->getSize(), useGpu);
-    to[i]->copyFrom(*wgt);
-  }
-}
-
-void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
-                              vector<ParameterPtr>& to) {
-  CHECK_EQ(from.size(), to.size());
-  for (size_t i = 0; i < from.size(); ++i) {
-    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
-    wgt->copyFrom(*from[i]);
-  }
-}
-
-// clear parameters grad
-void MKLDNNTester::clearWgtDiffs(size_t id) {
-  CHECK_LE(id, parameters_.size());
-  for (size_t n = 0; n < parameters_.size(); ++n) {
-    if (id == n || id == parameters_.size()) {
-      for (size_t i = 0; i < parameters_[n].size(); ++i) {
-        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
-        if (grad) {
-          grad->zeroMem();
-        }
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearBotDiffs(size_t id) {
-  CHECK_LE(id, dataLayers_.size());
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    if (id == n || id == dataLayers_.size()) {
-      // clear inputs layers of this specific layer
-      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-        dataLayers_[n][i]->getOutputGrad()->zeroMem();
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearTopDatas(size_t id) {
-  CHECK_LE(id, testLayers_.size());
-  for (size_t i = 0; i < testLayers_.size(); ++i) {
-    if (id == i || id == testLayers_.size()) {
-      testLayers_[i]->getOutputValue()->zeroMem();
-    }
-  }
-}
-
-void MKLDNNTester::printTopDatas() {
-  if (!log_) {
-    return;
-  }
-
-  for (int n = 0; n < NUM; ++n) {
-    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
-                     << " Forward Result: OutputValue";
-    printMatrix(testLayers_[n]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::printMatrix(const MatrixPtr& m) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  m->print(ostr);
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-void MKLDNNTester::printVector(const VectorPtr& v) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  v->print(ostr, v->getSize());
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-double MKLDNNTester::getDelta(const real* refer,
-                              const real* value,
-                              size_t len,
-                              const float failRate,
-                              const float thres) {
-  double delta = 0, sum = 0;
-  int failCnt = 0;
-  const double eps = 1e-5;
-  double maxRatio = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double ref = fabs(refer[i]);
-    double val = fabs(value[i]);
-    double diff = fabs(refer[i] - value[i]);
-    delta += diff;
-    sum += ref;
-    if (ref < eps && val < eps) {  // both values are very small
-      continue;
-    }
-    double ratio = diff / ref;
-    if (ratio > thres) {
-      maxRatio = std::max(maxRatio, ratio);
-      failCnt++;
-    }
-  }
-  EXPECT_FALSE(std::isinf(sum));
-  EXPECT_FALSE(std::isnan(sum));
-  EXPECT_FALSE(std::isnan(delta));
-  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
-                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
-  double res = sum > eps ? delta / sum : eps;
-  return (failCnt / (float)len) > failRate ? maxRatio : res;
-}
-
-double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
-  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
-  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
-}
-
-double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
-  CHECK_EQ(v1->getSize(), v2->getSize());
-  return getDelta(v1->getData(), v2->getData(), v1->getSize());
-}
-
-void MKLDNNTester::runOnce() {
-  // test forward
-  randomBotDatas();
-  dnnLayer_->forward(passType_);
-  refLayer_->forward(passType_);
-  checkForward();
-
-  if (passType_ == PASS_TEST) {
-    return;
-  }
-
-  // test backward
-  // simple updater
-  UpdateCallback updateCallback = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-  randomTopDiffs();
-  dnnLayer_->backward(updateCallback);
-  refLayer_->backward(updateCallback);
-  checkBackwardData();
-  checkBackwardWgts();
-
-  // clear buffers
-  // ref code will addto the diff, dnn code will writeto it
-  // and clearTopDatas(REF) should be coverd by ref layers
-  clearBotDiffs(REF);
-  clearWgtDiffs(REF);
-  // it is necessary to clear bottom diffs when only activation is dnn type
-  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
-    clearBotDiffs(DNN);
-  }
-}
-
-void MKLDNNTester::run(const TestConfig& dnn,
-                       const TestConfig& ref,
-                       size_t batchSize,
-                       size_t inputImgH,
-                       size_t inputImgW,
-                       PassType passType,
-                       bool printDetails,
-                       size_t iter,
-                       float epsilon) {
-  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
-        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
-      << "should be MKLDNN layer or MKLDNN activation";
-  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.active_type() << " vs "
-                       << ref.layerConfig.active_type();
-  } else {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.type() << " vs "
-                       << ref.layerConfig.type();
-  }
-
-  ih_ = inputImgH;
-  iw_ = inputImgW;
-  passType_ = passType;
-  log_ = printDetails;
-  iter_ = iter;
-  eps_ = epsilon;
-
-  // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
-  reset(dnn, ref, batchSize);
-  randomWgtDatas();
-  clearWgtDiffs();
-  clearBotDiffs();
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-
-  if (parameters_[DNN].empty()) {
-    // has no paramters
-    return;
-  }
-
-  // After run some iterations, the mkldnn weight has been stored in dnnLayer
-  // and we can also get the mkldnn weight parameter header format.
-  // Weight parameter should always be index 0 (and bias index 1).
-  // TODO(TJ): should also consider mean and var format when batchnorm ready
-  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
-  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
-  if (dnnWgtFmt == refWgtFmt) {
-    // weight format are equal, so no need check more
-    return;
-  }
-
-  // then save the weights and restart again
-  vector<VectorPtr> dnnWgts, refWgts;
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  saveWgt(parameters_[DNN], dnnWgts);
-  saveWgt(parameters_[REF], refWgts);
-
-  // restart again with dnn weight format
-  reset(dnn, ref, batchSize);
-  // TODO(TJ): should also considerate mean and var format when batchnorm ready
-  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
-
-  // restore wgt
-  restoreWgt(dnnWgts, parameters_[DNN]);
-  restoreWgt(refWgts, parameters_[REF]);
-  clearWgtDiffs();
-  clearBotDiffs();
-
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-}
-
-void MKLDNNTester::initArgument(DataIn& data,
-                                const std::string& configPath,
-                                const size_t iter) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-  data.inArgs.resize(iter);
-  data.outGrads.resize(iter);
-  data.paraValues.clear();
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      Argument arg;
-      arg.value = Matrix::create(batchSize, layerSize, false, false);
-      arg.grad = Matrix::create(batchSize, layerSize, false, false);
-      arg.value->randomizeUniform();
-      arg.value->add(-0.5);
-      arg.value->sigmoid(*arg.value);
-      arg.grad->zeroMem();
-      arg.ids = VectorT<int>::create(batchSize, false);
-      arg.ids->rand(layerSize);
-      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-      data.inArgs[i].push_back(arg);
-    }
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
-      grad->randomizeUniform();
-      data.outGrads[i].push_back(grad);
-    }
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), false);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void MKLDNNTester::getOutResult(const std::string& configPath,
-                                DataIn& in,
-                                DataOut& out,
-                                bool use_mkldnn,
-                                size_t iter) {
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = use_mkldnn;
-  *ThreadLocalRand::getSeed() = 1;
-  srand(1);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-  auto gradientMachine = trainer.getGradientMachine();
-  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-  }
-  UpdateCallback simpleUpdate = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-
-  vector<Argument> outArgs;
-  gradientMachine->start();
-  out.outValues.clear();
-  out.paraValues.clear();
-  for (size_t i = 0; i < iter; ++i) {
-    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
-    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
-    // save forward result
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      const MatrixPtr& src = outArgs[k].value;
-      MatrixPtr dst =
-          Matrix::create(src->getHeight(), src->getWidth(), false, false);
-      if (typeid(*src) == typeid(MKLDNNMatrix)) {
-        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
-        dnnSrc->copyTo(*dst);
-      } else {
-        dst->copyFrom(*src);
-      }
-      out.outValues.push_back(dst);
-    }
-
-    // random backward input
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
-    }
-    gradientMachine->backward(simpleUpdate);
-  }
-  gradientMachine->finish();
-
-  // save param value
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr val = Vector::create(
-        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
-    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    out.paraValues.push_back(val);
-  }
-}
-
-void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
-  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
-  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
-  for (size_t i = 0; i < ref.outValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
-    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
-  }
-  for (size_t i = 0; i < ref.paraValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
-    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
-  }
-}
-
-void MKLDNNTester::runNetTest(const std::string& configPath,
-                              size_t iter,
-                              float eps) {
-  DataIn in;
-  initArgument(in, configPath, iter);
-  DataOut outCpu, outDnn;
-  VLOG(MKLDNN_TESTS) << "runing cpu network";
-  getOutResult(configPath, in, outCpu, false, iter);
-  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
-  getOutResult(configPath, in, outDnn, true, iter);
-
-  compareResult(outCpu, outDnn, eps);
-}
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.h b/paddle/legacy/gserver/tests/MKLDNNTester.h
deleted file mode 100644
index 086846ce5..000000000
--- a/paddle/legacy/gserver/tests/MKLDNNTester.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "LayerGradUtil.h"
-#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
-#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
-
-namespace paddle {
-
-/**
- * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
- * refer to paddle original function
- */
-class MKLDNNTester {
-  enum {
-    DNN = 0,  // MKLDNN layer
-    REF = 1,  // Reference layer
-    NUM = 2,  // Number of total
-  };
-
-  struct DataIn {
-    std::vector<std::vector<Argument>> inArgs;
-    std::vector<std::vector<MatrixPtr>> outGrads;
-    std::vector<VectorPtr> paraValues;
-  };
-
-  struct DataOut {
-    std::vector<MatrixPtr> outValues;
-    std::vector<VectorPtr> paraValues;
-  };
-
- protected:
-  std::vector<TestConfig> configs_;
-  vector<string> layerNames_;
-  vector<vector<DataLayerPtr>> dataLayers_;
-  vector<vector<Argument>> datas_;
-  vector<LayerMap> layerMaps_;
-  vector<vector<ParameterPtr>> parameters_;
-  vector<LayerPtr> testLayers_;
-  LayerPtr refLayer_, dnnLayer_;
-
-  /// run some iterations, all the result should pass
-  size_t iter_;
-  /// whether to print out the details
-  bool log_;
-  /// epsilon
-  float eps_;
-  /// input image size, default 1
-  size_t ih_, iw_;
-  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
-  PassType passType_;
-
- public:
-  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
-    iter_ = iter;
-    eps_ = epsilon;
-    log_ = false;
-    passType_ = PASS_TRAIN;
-  }
-
-  ~MKLDNNTester() {}
-
- public:
-  void run(const TestConfig& dnn,
-           const TestConfig& ref,
-           size_t batchSize,
-           size_t inputImgH = 1,
-           size_t inputImgW = 1,
-           PassType passType = PASS_TRAIN,
-           bool printDetails = false,
-           size_t iter = 3,
-           float epsilon = 1e-4);
-  static void runNetTest(const std::string& configPath,
-                         size_t iter = 2,
-                         float eps = 1e-4);
-  static void initArgument(DataIn& data,
-                           const std::string& configPath,
-                           size_t iter = 2);
-  static void getOutResult(const std::string& configPath,
-                           DataIn& in,
-                           DataOut& out,
-                           bool use_mkldnn,
-                           size_t iter = 2);
-
- private:
-  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
-  void setInputImgSize();
-  void runOnce();
-
-  void randomWgtDatas();
-  void randomBotDatas();
-  void randomTopDiffs();
-
-  void checkForward();
-  void checkBackwardData();
-  void checkBackwardWgts();
-
-  // clear specific layer, clear all when id equals NUM
-  void clearWgtDiffs(size_t id = NUM);
-  void clearBotDiffs(size_t id = NUM);
-  void clearTopDatas(size_t id = NUM);
-
-  void printTopDatas();
-  void printMatrix(const MatrixPtr& m);
-  void printVector(const VectorPtr& v);
-
-  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
-  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
-
-  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
-  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
-
-  /**
-   * Get delta percent
-   * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points
-   * return the max(diff/ref)
-   * else return sum(abs(diff)) / sum(abs(ref))
-   * The return value should be smaller than eps when passing.
-   */
-  static double getDelta(const real* refer,
-                         const real* value,
-                         size_t len,
-                         const float failRate = 1e-3,
-                         const float thres = 0.1);
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/Sequence/dummy.list b/paddle/legacy/gserver/tests/Sequence/dummy.list
deleted file mode 100644
index 0e52665e1..000000000
--- a/paddle/legacy/gserver/tests/Sequence/dummy.list
+++ /dev/null
@@ -1 +0,0 @@
-dummy_file_no_use
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict b/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
deleted file mode 100644
index 41f68e7f5..000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
+++ /dev/null
@@ -1,158 +0,0 @@
-，
-的
-。
-酒店
-房间
-了
-很
-也
-不错
-是
-！
-有
-服务
-就是
-都
-住
-一
-在
-好
-月湖
-不
-可以
-.
-且
-就
-离
-方便
-早餐
-还是
-近
-位置
-干净
-床上用品
-、
-价格
-挺
-强烈推荐
-感觉
-卫生
-本来
-挺好
-性价比
-房
-前台
-下次
-交通
-不过
-很方便
-给
-没
-这个
-不少
-还有
-十一
-来
-还会
-停电
-推荐
-流
-服务员
-新
-舒适
-选择
-热情
-简直
-吃饭
-安静
-吃
-很干净
-地理位置
-便利
-得
-这
-子
-杯子
-很多
-周围
-適
-第
-天一广场
-整体
-好吃
-*
-尚可
-品质
-2
-时候
-家
-出差
-又
-较
-便宜
-整洁
-啊
-汉庭
-交通便利
-旁边
-对
-去过
-次
-利落
-合
-换
-窗户
-温馨
-最
-两
-应该
-只有
-适中
-出去玩
-很安静
-商务
-对面
-道歉
-乾
-地铁站
-居然
-不远
-总体来说
-泳池
-地段
-全家
-相对
-晚
-天一阁
-电脑
-來
-呀
-一人
-口头
-上网
-刷牙
-相当
-天
-合理
-准备
-通知
-第一天
-水温
-出来
-五星级
-快
-无
-楼层
-各方面
-华润万家
-宁波
-选
-放心
-浄
-主要原因
-安排
-客户
-一次性杯子
-起
-床垫
-一早
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
deleted file mode 100644
index 2cdf7f7e1..000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
+++ /dev/null
@@ -1,10 +0,0 @@
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
diff --git a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
deleted file mode 100644
index 3aa890d8a..000000000
--- a/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
+++ /dev/null
@@ -1,14 +0,0 @@
-2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
-2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
-
-2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
-2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
-2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
-
-2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
-2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
-
-2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
-2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
-2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
-
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list b/paddle/legacy/gserver/tests/Sequence/train.list
deleted file mode 100644
index 1109a2449..000000000
--- a/paddle/legacy/gserver/tests/Sequence/train.list
+++ /dev/null
@@ -1 +0,0 @@
-legacy/gserver/tests/Sequence/tour_train_wdseg
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list.nest b/paddle/legacy/gserver/tests/Sequence/train.list.nest
deleted file mode 100644
index a67df3502..000000000
--- a/paddle/legacy/gserver/tests/Sequence/train.list.nest
+++ /dev/null
@@ -1 +0,0 @@
-legacy/gserver/tests/Sequence/tour_train_wdseg.nest
diff --git a/paddle/legacy/gserver/tests/__init__.py b/paddle/legacy/gserver/tests/__init__.py
deleted file mode 100644
index f662d6826..000000000
--- a/paddle/legacy/gserver/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/gserver/tests/concat_dotmul_a.conf b/paddle/legacy/gserver/tests/concat_dotmul_a.conf
deleted file mode 100644
index db02ca7e8..000000000
--- a/paddle/legacy/gserver/tests/concat_dotmul_a.conf
+++ /dev/null
@@ -1,31 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000)
-
-data = data_layer(name ="input", size=1000)
-
-with mixed_layer(size=1000) as layer1:
-    layer1 += dotmul_projection(input=data)
-
-with mixed_layer(size=1000) as layer2:
-    layer2 += dotmul_projection(input=data)
-
-concat = concat_layer(input=[layer1, layer2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_dotmul_b.conf b/paddle/legacy/gserver/tests/concat_dotmul_b.conf
deleted file mode 100644
index 5e64970e4..000000000
--- a/paddle/legacy/gserver/tests/concat_dotmul_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000)
-
-data = data_layer(name ="input", size=1000)
-
-proj1 = dotmul_projection(input=data)
-
-proj2 = dotmul_projection(input=data)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
deleted file mode 100644
index 940d1efc5..000000000
--- a/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
+++ /dev/null
@@ -1,35 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=100)
-
-# fc1 is equal to fc2
-# note that in mixed_layer, default bias_attr=False,
-# and default act=LinearActivation().
-fc1 = fc_layer(input=data, size=1000, 
-               bias_attr=False, 
-               act=LinearActivation())
-
-with mixed_layer(size=1000) as fc2:
-    fc2 += full_matrix_projection(input=data)
-
-concat = concat_layer(input=[fc1, fc2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
deleted file mode 100644
index 931e5b38e..000000000
--- a/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=100)
-
-proj1 = full_matrix_projection(input=data, size=1000)
-
-proj2 = full_matrix_projection(input=data, size=1000)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_slice_a.conf b/paddle/legacy/gserver/tests/concat_slice_a.conf
deleted file mode 100644
index dccf91108..000000000
--- a/paddle/legacy/gserver/tests/concat_slice_a.conf
+++ /dev/null
@@ -1,41 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=8*16*16)
-
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
-
-proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
-
diff --git a/paddle/legacy/gserver/tests/concat_slice_b.conf b/paddle/legacy/gserver/tests/concat_slice_b.conf
deleted file mode 100644
index 29686ef28..000000000
--- a/paddle/legacy/gserver/tests/concat_slice_b.conf
+++ /dev/null
@@ -1,41 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-
-data = data_layer(name ="input", size=8*16*16)
-
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-proj1 = slice_projection(input=conv1, slices=[(0, 12)])
-
-proj2 = slice_projection(input=conv2, slices=[(1, 15)])
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
-
diff --git a/paddle/legacy/gserver/tests/concat_table_a.conf b/paddle/legacy/gserver/tests/concat_table_a.conf
deleted file mode 100644
index 047cb44d1..000000000
--- a/paddle/legacy/gserver/tests/concat_table_a.conf
+++ /dev/null
@@ -1,32 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=300)
-
-data = data_layer(name ="input", size=10000)
-
-# emb1 is equal to emb2, note that bias_attr=false 
-# and act=LinearActivation() in default.
-emb1 = embedding_layer(input=data, size=128)
-
-with mixed_layer(size=128) as emb2:
-    emb2 += table_projection(input=data)
-
-concat = concat_layer(input=[emb1, emb2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/concat_table_b.conf b/paddle/legacy/gserver/tests/concat_table_b.conf
deleted file mode 100644
index c666ab994..000000000
--- a/paddle/legacy/gserver/tests/concat_table_b.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=300)
-
-data = data_layer(name ="input", size=10000)
-
-proj1 = table_projection(input=data, size=128)
-
-proj2 = table_projection(input=data, size=128)
-
-concat = concat_layer(input=[proj1, proj2])
-
-outputs(concat)
diff --git a/paddle/legacy/gserver/tests/img_conv_a.conf b/paddle/legacy/gserver/tests/img_conv_a.conf
deleted file mode 100644
index 3ad15c64f..000000000
--- a/paddle/legacy/gserver/tests/img_conv_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8,
-                        num_filters=16, stride=1,
-                        bias_attr=False,
-                        act=ReluActivation())
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation())
-
-concat = concat_layer(input=[conv1, conv2])
-
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=16, stride=1,
-                      bias_attr=True,
-                      act=LinearActivation(),
-                      groups=2)
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_b.conf b/paddle/legacy/gserver/tests/img_conv_b.conf
deleted file mode 100644
index e68008155..000000000
--- a/paddle/legacy/gserver/tests/img_conv_b.conf
+++ /dev/null
@@ -1,32 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8, num_filters=16, stride=1)
-proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8, num_filters=16, stride=1)
-concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
-
-proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8, num_filters=16, stride=1, groups=2)
-
-with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
-    conv += proj
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_c.conf b/paddle/legacy/gserver/tests/img_conv_c.conf
deleted file mode 100644
index 4598ffbdb..000000000
--- a/paddle/legacy/gserver/tests/img_conv_c.conf
+++ /dev/null
@@ -1,43 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                        num_channels=8,
-                        num_filters=16, stride=1,
-                        bias_attr=False,
-                        act=ReluActivation(),
-                        layer_type="exconv")
-conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                       num_channels=8,
-                       num_filters=16, stride=1,
-                       bias_attr=False,
-                       act=ReluActivation(),
-                       layer_type="exconv")
-
-concat = concat_layer(input=[conv1, conv2])
-
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=16, stride=1,
-                      bias_attr=True,
-                      act=LinearActivation(),
-                      groups=2,
-                      layer_type="exconv")
-
-outputs(concat, conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_cudnn.py b/paddle/legacy/gserver/tests/img_conv_cudnn.py
deleted file mode 100644
index fd889ee1c..000000000
--- a/paddle/legacy/gserver/tests/img_conv_cudnn.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name="input", size=8 * 16 * 16)
-conv = img_conv_layer(
-    input=data,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    stride=1,
-    bias_attr=True,
-    act=LinearActivation(),
-    groups=2,
-    layer_type="cudnn_conv")
-
-outputs(conv)
diff --git a/paddle/legacy/gserver/tests/img_conv_exconv.py b/paddle/legacy/gserver/tests/img_conv_exconv.py
deleted file mode 100644
index 5aca6da5a..000000000
--- a/paddle/legacy/gserver/tests/img_conv_exconv.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name="input", size=8 * 16 * 16)
-conv = img_conv_layer(
-    input=data,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    stride=1,
-    bias_attr=True,
-    act=LinearActivation(),
-    groups=2,
-    layer_type="exconv")
-
-outputs(conv)
diff --git a/paddle/legacy/gserver/tests/img_pool_a.conf b/paddle/legacy/gserver/tests/img_pool_a.conf
deleted file mode 100644
index afd271055..000000000
--- a/paddle/legacy/gserver/tests/img_pool_a.conf
+++ /dev/null
@@ -1,44 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8,
-                      num_filters=8,stride=1)
-maxpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=MaxPooling(),
-)
-avgpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=AvgPooling(),
-)
-
-outputs([maxpool, avgpool])
diff --git a/paddle/legacy/gserver/tests/img_pool_b.conf b/paddle/legacy/gserver/tests/img_pool_b.conf
deleted file mode 100644
index e8deb9edb..000000000
--- a/paddle/legacy/gserver/tests/img_pool_b.conf
+++ /dev/null
@@ -1,44 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=10)
-data = data_layer(name ="input", size=8*16*16)
-conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                      num_channels=8, num_filters=8, stride=1)
-maxpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=CudnnMaxPooling(),
-)
-
-avgpool = img_pool_layer(input=conv,
-                         pool_size=3,
-                         pool_size_y=5,
-                         num_channels=8,
-                         stride=1,
-                         stride_y=2,
-                         padding=1,
-                         padding_y=2,
-                         pool_type=CudnnAvgPooling(),
-)
-
-outputs([maxpool, avgpool])
diff --git a/paddle/legacy/gserver/tests/mkldnn_branch_net.conf b/paddle/legacy/gserver/tests/mkldnn_branch_net.conf
deleted file mode 100644
index 8d5146abb..000000000
--- a/paddle/legacy/gserver/tests/mkldnn_branch_net.conf
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-def two_conv(input, group_name):
-  out1 = img_conv_layer(input=input,
-              name=group_name+'_conv1_',
-              filter_size=1,
-              num_filters=channels,
-              padding=0,
-              shared_biases=True,
-              act=ReluActivation())
-
-  out2 = img_conv_layer(input=input,
-              name=group_name+'_conv2_',
-              filter_size=3,
-              num_filters=channels,
-              padding=1,
-              shared_biases=True,
-              act=ReluActivation())
-  return out1, out2
-
-def two_conv_bn(input, group_name):
-  out1, out2 = two_conv(input, group_name)
-  out1 = batch_norm_layer(input=out1,
-              name=group_name+'_bn1_',
-              use_global_stats=False,
-              act=ReluActivation())
-
-  out2 = batch_norm_layer(input=out2,
-              name=group_name+'_bn2_',
-              use_global_stats=False,
-              act=ReluActivation())
-  return out1, out2
-
-def two_conv_pool(input, group_name):
-  out1, out2 = two_conv(input, group_name)
-  out1 = img_pool_layer(input=out1,
-              name=group_name+'_pool1_',
-              pool_size=3,
-              stride=2,
-              padding=0,
-              pool_type=MaxPooling())
-
-  out2 = img_pool_layer(input=out2,
-              name=group_name+'_pool2_',
-              pool_size=5,
-              stride=2,
-              padding=1,
-              pool_type=MaxPooling())
-  return out1, out2
-
-def two_fc(input, group_name):
-  out1 = fc_layer(input=input,
-            name=group_name+'_fc1_',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-
-  out2 = fc_layer(input=input,
-            name=group_name+'_fc2_',
-            size=channels,
-            bias_attr=False,
-            act=LinearActivation())
-  return out1, out2
-
-data = data_layer(name ="input", size=channels*16*16)
-
-tmp = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-a1, a2 = two_conv(tmp, 'conv_branch')
-tmp = addto_layer(input=[a1, a2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=AvgPooling())
-
-b1, b2 = two_conv_pool(tmp, 'pool_branch')
-tmp = concat_layer(input=[b1, b2])
-
-tmp = img_pool_layer(input=tmp,
-            num_channels=channels*2,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            stride=2,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-c1, c2 = two_conv_bn(tmp, 'bn_branch')
-tmp = addto_layer(input=[c1, c2],
-            act=ReluActivation(),
-            bias_attr=False)
-
-tmp = fc_layer(input=tmp, size=channels,
-            bias_attr=True,
-            act=ReluActivation())
-
-d1, d2 = two_fc(tmp, 'fc_branch')
-tmp = addto_layer(input=[d1, d2])
-
-out = fc_layer(input=tmp, size=10,
-            bias_attr=True,
-            act=SoftmaxActivation())
-
-outputs(out)
diff --git a/paddle/legacy/gserver/tests/mkldnn_simple_net.conf b/paddle/legacy/gserver/tests/mkldnn_simple_net.conf
deleted file mode 100644
index 0e9d6b31f..000000000
--- a/paddle/legacy/gserver/tests/mkldnn_simple_net.conf
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=16)
-channels = get_config_arg("channels", int, 2)
-
-data = data_layer(name ="input", size=channels*16*16)
-
-tmp = img_conv_layer(input=data,
-            num_channels=channels,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=1,
-            padding=0,
-            pool_type=AvgPooling())
-
-tmp = img_conv_layer(input=tmp,
-            filter_size=3,
-            num_filters=channels,
-            padding=1,
-            shared_biases=True,
-            act=LinearActivation(),
-            bias_attr=False)
-
-tmp = batch_norm_layer(input=tmp,
-            use_global_stats=False,
-            act=ReluActivation())
-
-tmp = img_pool_layer(input=tmp,
-            pool_size=3,
-            stride=2,
-            padding=1,
-            pool_type=MaxPooling())
-
-tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
-
-tmp = fc_layer(input=tmp,
-            size=channels,
-            bias_attr=False,
-            act=ReluActivation())
-
-out = fc_layer(input=tmp,
-            size=10,
-            bias_attr=True,
-            act=SoftmaxActivation())
-
-outputs(out)
diff --git a/paddle/legacy/gserver/tests/pyDataProvider.py b/paddle/legacy/gserver/tests/pyDataProvider.py
deleted file mode 100644
index 85ea90d6e..000000000
--- a/paddle/legacy/gserver/tests/pyDataProvider.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import numpy
-import struct
-import traceback
-
-
-def header_creator():
-    ret = ""
-    ret += struct.pack('i', 3)  # slot num
-    ret += struct.pack('i', 1)  # sequence flag
-    ret += struct.pack('i', 0)  # slot0 dense type
-    ret += struct.pack('i', 3)  # slot0 dim
-    ret += struct.pack('i', 1)  # slot1 sparse non value type
-    ret += struct.pack('i', 7)  # slot1 dim
-    ret += struct.pack('i', 3)  # slot2 index type
-    ret += struct.pack('i', 2)  # slot2 dim
-    return ret
-
-
-def dense_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot0 sample num
-    for i in range(sample_num):  # slot0 value
-        ret += struct.pack('f', 1.0)
-        ret += struct.pack('f', 2.0)
-        ret += struct.pack('f', 3.0)
-    return ret
-
-
-def sparse_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot1 sample num
-    for i in range(sample_num):  # slot1 index
-        ret += struct.pack('i', i * 2)
-    ret += struct.pack('i', sample_num * 2)  #slot1 length
-    for i in range(sample_num):  # slot1 value
-        ret += struct.pack('i', 1)
-        ret += struct.pack('i', 2)
-    return ret
-
-
-def index_value_creator(sample_num):
-    ret = ""
-    ret += struct.pack('i', sample_num)  # slot2 sample num
-    for i in range(sample_num):  # slot2 value
-        ret += struct.pack('i', 0)
-    return ret
-
-
-def sequenceStartPositions_creator():
-    ret = ""
-    ret += struct.pack('i', 2)  # slot0 sequence num
-    ret += struct.pack('i', 0)  # slot0 sequence value1
-    ret += struct.pack('i', 1)  # slot0 sequence value2
-    ret += struct.pack('i', 1)  # slot1 sequence num
-    ret += struct.pack('i', 0)  # slot1 sequence value1
-    ret += struct.pack('i', 2)  # slot2 sequence num
-    ret += struct.pack('i', 0)  # slot2 sequence value1
-    ret += struct.pack('i', 1)  # slot2 sequence value2
-    return ret
-
-
-def subSequenceStartPositions_creator():
-    ret = ""
-    ret += struct.pack('i', 3)  # slot0 subsequence num
-    ret += struct.pack('i', 0)  # slot0 subsequence value1
-    ret += struct.pack('i', 1)  # slot0 subsequence value2
-    ret += struct.pack('i', 2)  # slot0 subsequence value3
-    ret += struct.pack('i', 2)  # slot1 subsequence num
-    ret += struct.pack('i', 0)  # slot1 subsequence value1
-    ret += struct.pack('i', 1)  # slot1 subsequence value2
-    ret += struct.pack('i', 3)  # slot2 subsequence num
-    ret += struct.pack('i', 0)  # slot2 subsequence value1
-    ret += struct.pack('i', 1)  # slot2 subsequence value2
-    ret += struct.pack('i', 2)  # slot2 subsequence value3
-    return ret
-
-
-class SimpleDataProvider:
-    def __init__(self, *file_list):
-        self.file_list = file_list
-
-    def shuffle(self):
-        pass
-
-    def reset(self):
-        pass
-
-    def getHeader(self):
-        return header_creator()
-
-    def getNextBatch(self, batch_size):
-        ret = ""
-        ret += struct.pack('i', 2)  # batch size
-        ret += dense_value_creator(2)  # slot0
-        ret += sparse_value_creator(2)  # slot1
-        ret += index_value_creator(2)  # slot2
-        ret += sequenceStartPositions_creator()
-        return ret
-
-
-class SimpleNestDataProvider:
-    def __init__(self, *file_list):
-        self.file_list = file_list
-
-    def shuffle(self):
-        pass
-
-    def reset(self):
-        pass
-
-    def getHeader(self):
-        return header_creator()
-
-    def getNextBatch(self, batch_size):
-        ret = ""
-        ret += struct.pack('i', 2)  # batch size
-        ret += dense_value_creator(4)  # slot0
-        ret += sparse_value_creator(4)  # slot1
-        ret += index_value_creator(4)  # slot2
-        ret += sequenceStartPositions_creator()
-        ret += subSequenceStartPositions_creator()
-        return ret
-
-
-if __name__ == "__main__":
-    # test code
-    data_provider = SimpleDataProvider('./test_batch')
-    print len(data_provider.getHeader())
-    print len(data_provider.getNextBatch(2))
-
-    data_provider = SimpleNestDataProvider('./test_batch')
-    print len(data_provider.getHeader())
-    print len(data_provider.getNextBatch(2))
diff --git a/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList b/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
deleted file mode 100644
index e69de29bb..000000000
diff --git a/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf b/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
deleted file mode 100644
index 7d910df20..000000000
--- a/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-################################### Data Configuration ###################
-TrainData(PyData(type="py", 
-            files = "./gserver/tests/pyDataProvider/pyDataProviderList", 
-            load_data_module="pyDataProvider", 
-            load_data_object="SimpleDataProvider"))
-
-################################### Algorithm Configuration #############
-Settings(
-    learning_rate_decay_a = 1e-05,
-    learning_rate_decay_b = 1e-06,
-    learning_rate = 0.001,
-    batch_size = 1,
-    algorithm = 'sgd',
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-)
-
-################################### Network Configuration ###############
-Layer(type = "data", name = "input1", size = 3)
-Layer(type = "data", name = "input2", size = 7)
-
-Layer(inputs = [Input("input1", 
-                      decay_rate = 0.12, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer1_1.w"), 
-                Input("input2", 
-                      decay_rate = 0.12, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer1_2.w"),
-               ], 
-      name = "layer1", 
-      bias = Bias(parameter_name = "_layer1.bias"), 
-      active_type = "sigmoid", 
-      type = "fc", 
-      size = 100)
-Layer(inputs = [Input("layer1", 
-                      decay_rate = 0.06, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer2.w")], 
-      name = "layer2", 
-      bias = Bias(parameter_name = "_layer2.bias"), 
-      active_type = "sigmoid", 
-      type = "fc", 
-      size = 100)
-Layer(inputs = [Input("layer2", 
-                      decay_rate = 0.02, 
-                      initial_std = 0.02, 
-                      parameter_name = "_layer_output.w")], 
-      name = "output", 
-      bias = Bias(parameter_name = "_layer_output.bias"), 
-      active_type = "softmax", 
-      type = "fc", 
-      size = 10)
-
-Layer(type = "data", name = "label", size = 1)
-Layer(inputs = [Input("output"), Input("label")], 
-      type = "multi-class-cross-entropy", 
-      name = "cost")
-Inputs("input1", "input2", "label")
-Outputs("cost")
diff --git a/paddle/legacy/gserver/tests/rnn_data_provider.py b/paddle/legacy/gserver/tests/rnn_data_provider.py
deleted file mode 100644
index 18b2191f4..000000000
--- a/paddle/legacy/gserver/tests/rnn_data_provider.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer.PyDataProvider2 import *
-
-# Note that each config should has an independent provider
-# in current design of PyDataProvider2.
-#######################################################
-data = [
-    [[[1, 3, 2], [4, 5, 2]], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], 1],
-]
-
-
-# Used for sequence_nest_rnn.conf
-@provider(
-    input_types=[integer_value_sub_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_subseq(settings, file_name):
-    for d in data:
-        yield d
-
-
-# Used for sequence_rnn.conf
-@provider(
-    input_types=[integer_value_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_seq(settings, file_name):
-    for d in data:
-        seq = []
-        for subseq in d[0]:
-            seq += subseq
-        yield seq, d[1]
-
-
-# Used for sequence_nest_rnn_multi_input.conf
-@provider(
-    input_types=[integer_value_sub_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_subseq2(settings, file_name):
-    for d in data:
-        yield d
-
-
-# Used for sequence_rnn_multi_input.conf
-@provider(
-    input_types=[integer_value_sequence(10), integer_value(3)],
-    should_shuffle=False)
-def process_seq2(settings, file_name):
-    for d in data:
-        seq = []
-        for subseq in d[0]:
-            seq += subseq
-        yield seq, d[1]
-
-
-###########################################################
-data2 = [
-    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], [[1, 5], [4], [2, 3, 6, 1]], 1],
-]
-
-
-# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sub_sequence(10), integer_value_sub_sequence(10),
-        integer_value(2)
-    ],
-    should_shuffle=False)
-def process_unequalength_subseq(settings, file_name):
-    for d in data2:
-        yield d
-
-
-# Used for sequence_rnn_multi_unequalength_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sequence(10), integer_value_sequence(10), integer_value(2)
-    ],
-    should_shuffle=False)
-def process_unequalength_seq(settings, file_name):
-    for d in data2:
-        words1 = reduce(lambda x, y: x + y, d[0])
-        words2 = reduce(lambda x, y: x + y, d[1])
-        yield words1, words2, d[2]
-
-
-###########################################################
-data3 = [
-    [[[1, 2], [4, 5, 2]], [1, 2], 0],
-    [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
-]
-
-
-# Used for sequence_nest_mixed_inputs.conf
-@provider(
-    input_types=[
-        integer_value_sub_sequence(10), integer_value_sequence(10),
-        integer_value(2)
-    ],
-    should_shuffle=False)
-def process_mixed(settings, file_name):
-    for d in data3:
-        yield d
diff --git a/paddle/legacy/gserver/tests/sequenceGen.py b/paddle/legacy/gserver/tests/sequenceGen.py
deleted file mode 100644
index d5ec8ac23..000000000
--- a/paddle/legacy/gserver/tests/sequenceGen.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import os
-import sys
-
-from paddle.trainer.PyDataProvider2 import *
-
-
-def hook(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [
-        integer_value_sequence(len(settings.word_dict)), integer_value(3)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook, should_shuffle=False)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            label, comment = line.strip().split('\t')
-            label = int(''.join(label.split()))
-            words = comment.split()
-            words = [
-                settings.word_dict[w] for w in words if w in settings.word_dict
-            ]
-            yield words, label
-
-
-## for hierarchical sequence network
-def hook2(settings, dict_file, **kwargs):
-    settings.word_dict = dict_file
-    settings.input_types = [
-        integer_value_sub_sequence(len(settings.word_dict)),
-        integer_value_sequence(3)
-    ]
-    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
-
-
-@provider(init_hook=hook2, should_shuffle=False)
-def process2(settings, file_name):
-    with open(file_name) as fdata:
-        labels = []
-        sentences = []
-        for line in fdata:
-            if (len(line)) > 1:
-                label, comment = line.strip().split('\t')
-                label = int(''.join(label.split()))
-                words = comment.split()
-                words = [
-                    settings.word_dict[w] for w in words
-                    if w in settings.word_dict
-                ]
-                labels.append(label)
-                sentences.append(words)
-            else:
-                yield sentences, labels
-                labels = []
-                sentences = []
diff --git a/paddle/legacy/gserver/tests/sequence_layer_group.conf b/paddle/legacy/gserver/tests/sequence_layer_group.conf
deleted file mode 100644
index ad1b61d58..000000000
--- a/paddle/legacy/gserver/tests/sequence_layer_group.conf
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# (lstm_input + lstm) is equal to lstmemory 
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory_group(
-    input=lstm_input,
-    size=hidden_dim,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_lstm.conf b/paddle/legacy/gserver/tests/sequence_lstm.conf
deleted file mode 100644
index 6ab70e707..000000000
--- a/paddle/legacy/gserver/tests/sequence_lstm.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data,
-    size=word_dim,
-    param_attr=ParamAttr(sparse_update=sparse_update))
-
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory(
-    input=lstm_input,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
deleted file mode 100644
index 75c36b118..000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list.nest',
-    test_list=None,
-    module='sequenceGen',
-    obj='process2',
-    args={"dict_file": dict_file})
-
-settings(batch_size=2)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb_group = embedding_layer(input=data, size=word_dim)
-
-
-# (lstm_input + lstm) is equal to lstmemory 
-def lstm_group(lstm_group_input):
-    with mixed_layer(size=hidden_dim * 4) as group_input:
-        group_input += full_matrix_projection(input=lstm_group_input)
-
-    lstm_output = lstmemory_group(
-        input=group_input,
-        name="lstm_group",
-        size=hidden_dim,
-        act=TanhActivation(),
-        gate_act=SigmoidActivation(),
-        state_act=TanhActivation())
-    return lstm_output
-
-
-lstm_nest_group = recurrent_group(
-    input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
-# hasSubseq ->(seqlastins) seq
-lstm_last = last_seq(
-    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)
-
-# seq ->(expand) hasSubseq
-lstm_expand = expand_layer(
-    input=lstm_last,
-    expand_as=emb_group,
-    expand_level=ExpandLevel.FROM_SEQUENCE)
-
-# hasSubseq ->(average) seq
-lstm_average = pooling_layer(
-    input=lstm_expand,
-    pooling_type=AvgPooling(),
-    agg_level=AggregateLevel.TO_SEQUENCE)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_average)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
deleted file mode 100644
index bc3b22c2a..000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
+++ /dev/null
@@ -1,74 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y):
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=x)
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" won't work, because recurrent_group only support the input 
-    # sequence type is same as return sequence type.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=SubsequenceInput(emb))
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
deleted file mode 100644
index 165ab2298..000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ /dev/null
@@ -1,76 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(wid, x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y, wid):
-        z = embedding_layer(input=wid, size=word_dim)
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, z, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=[x, wid])
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out
-    # links should be from sequences now.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(data), SubsequenceInput(emb)])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 9a48b7f25..000000000
--- a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_subseq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
-def outer_step(x1, x2):
-    index = [0]
-
-    def inner_step(ipt):
-        index[0] += 1
-        i = index[0]
-        outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim)
-
-        def inner_step_impl(y):
-            inner_mem = memory(
-                name="inner_rnn_state_" + y.name,
-                size=hidden_dim,
-                boot_layer=outer_mem)
-            out = fc_layer(
-                input=[y, inner_mem],
-                size=hidden_dim,
-                act=TanhActivation(),
-                bias_attr=True,
-                name='inner_rnn_state_' + y.name)
-            return out
-
-        encoder = recurrent_group(
-            step=inner_step_impl, name='inner_%d' % i, input=ipt)
-        last = last_seq(name="outer_rnn_state_%d" % i, input=encoder)
-        return encoder, last
-
-    encoder1, sentence_last_state1 = inner_step(ipt=x1)
-    encoder2, sentence_last_state2 = inner_step(ipt=x2)
-
-    encoder1_expand = expand_layer(
-        input=sentence_last_state1, expand_as=encoder2)
-
-    return [encoder1_expand, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
-    targetInlink=emb2)
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent.py b/paddle/legacy/gserver/tests/sequence_recurrent.py
deleted file mode 100644
index e2c6a7935..000000000
--- a/paddle/legacy/gserver/tests/sequence_recurrent.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent_group.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent_group.py b/paddle/legacy/gserver/tests/sequence_recurrent_group.py
deleted file mode 100644
index b4638bd90..000000000
--- a/paddle/legacy/gserver/tests/sequence_recurrent_group.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    with mixed_layer(
-            name="rnn_state",
-            size=hidden_dim,
-            bias_attr=False,
-            act=SoftmaxActivation()) as out:
-        out += identity_projection(input=y)
-        out += full_matrix_projection(
-            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
-    return out
-
-
-recurrent = recurrent_group(name="rnn", step=step, input=emb)
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn.conf b/paddle/legacy/gserver/tests/sequence_rnn.conf
deleted file mode 100644
index 3133595c9..000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=emb)
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
deleted file mode 100644
index 921cef04d..000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-subseq = embedding_layer(input=data1, size=word_dim)
-seq = embedding_layer(input=data2, size=word_dim)
-nonseq = embedding_layer(input=label, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_mixed_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(subseq, seq, nonseq):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner', input=[subseq, seq, nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[
-        subseq, expand_layer(
-            seq, expand_as=subseq,
-            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
-                nonseq,
-                expand_as=subseq,
-                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
-        StaticInput(encoding)
-    ])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
deleted file mode 100644
index c7bcaf6c4..000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_matched_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(data1, data2, label):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        subseq = embedding_layer(input=data1, size=word_dim)
-        seq = embedding_layer(input=data2, size=word_dim)
-        nonseq = embedding_layer(input=label, size=word_dim)
-
-        print_layer(input=[data1, seq, label, inner_mem])
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner',
-        input=[subseq, StaticInput(seq), nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
deleted file mode 100644
index bf4be779a..000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
+++ /dev/null
@@ -1,58 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y, wid):
-    z = embedding_layer(input=wid, size=word_dim)
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, z, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=[emb, data])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 3612b49c2..000000000
--- a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='legacy/gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_seq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the RNN in
-# sequence_nest_rnn_multi_unequalength_inputs.conf
-
-
-def step(x1, x2):
-    def calrnn(y):
-        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
-        out = fc_layer(
-            input=[y, mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='rnn_state_' + y.name)
-        return out
-
-    encoder1 = calrnn(x1)
-    encoder2 = calrnn(x2)
-    return [encoder1, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="stepout", step=step, input=[emb1, emb2])
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
deleted file mode 100644
index f468d229a..000000000
--- a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-void testActivation(const string& act) {
-  LOG(INFO) << "test activation: " << act;
-  size_t size = 10;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type(act);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  act + "_activation",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(Activation, activation) {
-  auto types = ActivationFunction::getAllRegisteredTypes();
-  std::set<string> excluded{"sequence_softmax"};
-  for (auto type : types) {
-    if (excluded.count(type)) continue;
-    testActivation(type);
-  }
-}
-
-void testSequenceSoftmaxAct(bool hasSubseq) {
-  LOG(INFO) << "test activation: sequence softmax";
-
-  const size_t size = 1;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sequence_softmax");
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       1,
-       0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sequence_softmax",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(SequenceSoftmaxActivation, activation) {
-  for (auto hasSubseq : {false, true}) {
-    LOG(INFO) << "hasSubseq = " << hasSubseq;
-    testSequenceSoftmaxAct(hasSubseq);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_BatchNorm.cpp b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
deleted file mode 100644
index e21fa1607..000000000
--- a/paddle/legacy/gserver/tests/test_BatchNorm.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/cuda/include/hl_batch_norm.h"
-#include "paddle/legacy/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the batchNormLayer can be followed by a ConvLayer
-TEST(Layer, batchNorm) {
-  FLAGS_use_gpu = false;
-  TestConfig configBN;
-  const int CHANNELS = 6272;
-  const int IMG_SIZE = 1;
-  configBN.layerConfig.set_type("batch_norm");
-  configBN.layerConfig.set_name("bn");
-  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
-  configBN.layerConfig.set_active_type("relu");
-  configBN.biasSize = CHANNELS;
-  configBN.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
-                                /* paraSize= */ CHANNELS});
-
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 64;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(64);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
-  input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(5);
-  conv->set_filter_size_y(5);
-  conv->set_channels(128);
-  conv->set_padding(1);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(7);
-  conv->set_output_x(3);
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(configBN,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "batch_norm",
-                100,
-                false,
-                false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr bnLayer;
-  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
-
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters2, &convLayer);
-
-  bnLayer->forward(PASS_GC);
-  convLayer->forward(PASS_GC);
-
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void batchNormInference(int n, int c, int h, int w) {
-  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  input->randomizeUniform();
-  cudnnOut->zeroMem();
-  cudaOut->zeroMem();
-
-  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
-  scale->randomizeUniform();
-  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
-  bias->randomizeUniform();
-
-  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
-  movingMean->randomizeUniform();
-
-  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
-  movingVar->randomizeUniform();
-  movingVar->clip(0.01, 50);
-
-  hl_tensor_descriptor ioDesc;
-  hl_tensor_descriptor bnDesc;
-  hl_create_tensor_descriptor(&ioDesc);
-  hl_create_tensor_descriptor(&bnDesc);
-  hl_tensor_reshape(ioDesc, n, c, h, w);
-  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
-
-  double EPS = 1E-5;
-  hl_batch_norm_forward_inference(ioDesc,
-                                  input->getData(),
-                                  ioDesc,
-                                  cudnnOut->getData(),
-                                  bnDesc,
-                                  scale->getData(),
-                                  bias->getData(),
-                                  movingMean->getData(),
-                                  movingVar->getData(),
-                                  EPS);
-
-  hl_batch_norm_cuda_inference(input->getData(),
-                               cudaOut->getData(),
-                               scale->getData(),
-                               bias->getData(),
-                               movingMean->getData(),
-                               movingVar->getData(),
-                               EPS,
-                               n,
-                               c,
-                               h,
-                               w);
-
-  cudnnCheck->copyFrom(*cudnnOut);
-  cudaCheck->copyFrom(*cudaOut);
-  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
-
-  hl_destroy_tensor_descriptor(ioDesc);
-  hl_destroy_tensor_descriptor(bnDesc);
-}
-
-TEST(BatchNorm, Inference) {
-  batchNormInference(33, 267, 1, 1);
-  batchNormInference(19, 105, 4, 4);
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
deleted file mode 100644
index 1dafd1de4..000000000
--- a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-// log(exp(x) + exp(y))
-static inline real logSum(real x, real y) {
-  real maxValue = std::max(x, y);
-  if (std::isinf(maxValue)) {
-    return -std::numeric_limits<real>::infinity();
-  } else {
-    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
-  }
-}
-
-static inline std::vector<int> genRandLabels(int numClasses, int length) {
-  std::vector<int> labels(length);
-  for (int i = 0; i < length; ++i) {
-    labels[i] = rand() % numClasses;  // NOLINT
-  }
-  return labels;
-}
-
-TEST(CRFLayer, cost) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-
-      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
-
-      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
-
-      real logZ = -std::numeric_limits<real>::infinity();
-      real logNominator = -std::numeric_limits<real>::infinity();
-      std::vector<int> testResult(length, 0);
-      do {
-        real score = a[testResult.front()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        score += b[testResult.back()];
-        logZ = logSum(logZ, score);
-
-        if (goldenLabels == testResult) {
-          logNominator = score;
-        }
-      } while (getNextSequence(testResult, numClasses));
-
-      real trueCost = -logNominator + logZ;
-
-      real diff = fabs(trueCost - cost);
-      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
-      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
-              << std::endl;
-      if (typeid(real) == typeid(double)) {  // NOLINT
-        EXPECT_LE(diff, 1e-10);
-      } else {
-        EXPECT_LE(diff, 5e-3);
-      }
-    }
-  }
-}
-
-inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
-
-TestConfig initTestConfig(size_t numClasses, bool withWeight) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(numClasses);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              numClasses,
-                              numClasses * (numClasses + 2)});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
-  config.layerConfig.add_inputs();
-
-  if (withWeight) {
-    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
-    config.layerConfig.add_inputs();
-  }
-
-  return config;
-}
-
-TEST(Layer, CRFLayer) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-TEST(Layer, CRFLayerUseWeight) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CompareSparse.cpp b/paddle/legacy/gserver/tests/test_CompareSparse.cpp
deleted file mode 100644
index 11b633a58..000000000
--- a/paddle/legacy/gserver/tests/test_CompareSparse.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 = "legacy/gserver/tests/sequence_lstm.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DEFINE_double(max_diff_ratio,
-              0.0f,
-              "max diff ratio allowed for parameters value");
-
-int gNumDevices = 0;
-
-std::vector<ParameterPtr> trainerOnePassTest(const string& configFile,
-                                             bool sparseUpdate,
-                                             int trainerCount = 1,
-                                             bool useGpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0";
-
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate;
-  srand(FLAGS_seed);
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  ThreadLocalRandomEngine::get().seed(FLAGS_seed);
-  if (useGpu) {
-    CHECK_LE(trainerCount, gNumDevices);
-  }
-
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-  if (!FLAGS_local) {
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    pservers.resize(numPorts);
-
-    for (int i = 0; i < numPorts; ++i) {
-      pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      pservers[i]->init();
-      pservers[i]->start();
-    }
-  }
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  trainer.train();
-  return trainer.getGradientMachine()->getParameters();
-}
-
-std::vector<ParameterPtr>& getDenseParameters() {
-  static std::vector<ParameterPtr> denseParameters;
-  if (denseParameters.empty()) {
-    // use dense training as base
-    FLAGS_local = true;
-    denseParameters = trainerOnePassTest(configFile1, false);
-  }
-
-  return denseParameters;
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 double maxDiffRatio) {
-  double maxDiff = 0;
-  double maxValue = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double diff = fabs(A[i] - B[i]);
-    maxValue = std::max<double>(maxValue, std::max(fabs(A[i]), fabs(B[i])));
-    maxDiff = std::max(maxDiff, diff);
-  }
-  EXPECT_LE(maxDiff / maxValue, maxDiffRatio);
-  LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue
-            << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n";
-}
-
-void compareValue(const vector<ParameterPtr>& parametersA,
-                  const vector<ParameterPtr>& parametersB,
-                  double maxDiffRatio = 0.0) {
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "para_A",
-                paraB.getData(),
-                "para_B",
-                paraA.getSize(),
-                maxDiffRatio);
-  }
-}
-
-TEST(compareSparse, cpu) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, remote_cpu) {
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, cpu10_local_vs_remote) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> localParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> remoteParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  compareValue(localParameters, remoteParameters);
-}
-
-TEST(compareSparse, multiGradientMachine) {
-  int numGpu;
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  numGpu = hl_get_device_count();
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = useGpu ? numGpu : 2;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, eps);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-TEST(compareSparse, NeuralNetwork) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = 1;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, useGpu ? eps : 0);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-int main(int argc, char** argv) {
-  // FIXME(tonyyang-svail):
-  //   Turn off this test due CI failure:
-  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
-  return 0;
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  gNumDevices = hl_get_device_count();
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameter
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
deleted file mode 100644
index e19c34abb..000000000
--- a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_bool(need_high_accuracy,
-            false,
-            "whether need to run in double accuracy");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_int32(seed);
-
-static const string& config_file_a =
-    "legacy/gserver/tests/sequence_recurrent.py";
-static const string& config_file_b =
-    "legacy/gserver/tests/sequence_recurrent_group.py";
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  srand(FLAGS_seed);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-
-  trainer.getDataProvider()->reset();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-
-  trainer.getGradientMachine()->start();
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &data.outArgs, PASS_TRAIN);
-
-  trainer.getGradientMachine()->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  real maxVal = 0;
-  for (size_t i = 0; i < len; ++i) {
-    maxVal = std::max(maxVal, std::max(A[i], B[i]));
-  }
-  real maxDiff = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    maxDiff = std::max(maxDiff, diff);
-    if (diff > maxVal * FLAGS_max_diff_ratio) {
-      nNum++;
-      VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << "    "
-              << desB << " : " << B[i] << " diff=" << diff;
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, config_file_a);
-  LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, config_file_b);
-  LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  FLAGS_thread_local_rand_use_global_seed = true;
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-5;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-10;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_ConvTrans.cpp b/paddle/legacy/gserver/tests/test_ConvTrans.cpp
deleted file mode 100644
index 4ea0a3d37..000000000
--- a/paddle/legacy/gserver/tests/test_ConvTrans.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the convTrans forward is the same as conv backward
-TEST(Layer, convTransLayerFwd) {
-  // Setting up conv-trans layer
-  TestConfig configt;
-  configt.biasSize = 3;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(3);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->forward(PASS_GC);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384});
-  input = config.layerConfig.add_inputs();
-  conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers2;
-  LayerMap layerMap2;
-  vector<Argument> datas2;
-  initDataLayer(
-      config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap2, &parameters2, &convLayer);
-
-  // Sync convLayer and convtLayer parameter
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)));
-
-  // Set convLayer outputGrad as convTransLayer input value
-  convLayer->forward(PASS_GC);
-  convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue()));
-
-  vector<int> callbackFlags(parameters2.size(), 0);
-  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
-  convLayer->backward(callback);
-
-  // Check that the convLayer backward is the same as convTransLayer forward
-  checkMatrixEqual(convtLayer->getOutputValue(),
-                   dataLayers2[0]->getOutputGrad());
-}
-
-// Do one forward pass of convTrans layer and check to see if its output
-// matches the given result
-void doOneConvtTest(size_t imgSize,
-                    size_t output_x,
-                    size_t stride,
-                    size_t padding,
-                    size_t filter_size,
-                    MatrixPtr& result) {
-  TestConfig configt;
-  configt.biasSize = 1;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(1);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(1);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(1);
-  conv->set_filter_channels(1);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->add(1.0);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->getParameters()[0]->zeroMem();
-  convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0);
-  convtLayer->forward(PASS_GC);
-
-  checkMatrixEqual(convtLayer->getOutputValue(), result);
-}
-
-TEST(Layer, convTransLayerFwd2) {
-  MatrixPtr result;
-  result = Matrix::create(1, 5 * 5, false, false);
-  result->zeroMem();
-  result->add(1.0);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 1,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                       4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 4,
-                 result);
-
-  real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData2);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 1,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
-                        2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
-  result->setData(resultData3);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 0,
-                 /* filter_size */ 3,
-                 result);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_ConvUnify.cpp b/paddle/legacy/gserver/tests/test_ConvUnify.cpp
deleted file mode 100644
index d4ca15835..000000000
--- a/paddle/legacy/gserver/tests/test_ConvUnify.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Do one forward pass of ConvLayer using either exconv or cudnn_conv
-MatrixPtr doOneConvTest(size_t imgSize,
-                        size_t output_x,
-                        size_t stride,
-                        size_t padding,
-                        size_t filter_size,
-                        size_t channel,
-                        size_t numfilters,
-                        size_t groups,
-                        MatrixPtr& inputData,
-                        real* param,
-                        bool useGpu,
-                        bool isDeconv = false) {
-  TestConfig config;
-  config.biasSize = numfilters;
-  string layerType;
-  if (useGpu) {
-    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
-  } else {
-    layerType = (isDeconv) ? "exconvt" : "exconv";
-  }
-  config.layerConfig.set_type(layerType);
-  config.layerConfig.set_num_filters(numfilters);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  size_t weightSize = channel * filter_size * filter_size *
-                      config.layerConfig.num_filters() / groups;
-  if (isDeconv) {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
-    config.layerConfig.set_size(imgSize * imgSize *
-                                config.layerConfig.num_filters());
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
-    config.layerConfig.set_size(output_x * output_x *
-                                config.layerConfig.num_filters());
-  }
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(channel);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(groups);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  if (isDeconv) {
-    conv->set_filter_channels(numfilters / groups);
-  } else {
-    conv->set_filter_channels(channel / groups);
-  }
-
-  config.layerConfig.set_name("conv");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->copyFrom(*inputData);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters, &convLayer);
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(param, weightSize);
-  convLayer->forward(PASS_GC);
-
-  return convLayer->getOutputValue();
-}
-
-TEST(Layer, convParaUnified) {
-#ifdef PADDLE_WITH_CUDA
-  MatrixPtr input, resultCpu, resultGpu;
-
-  /// TEST1 for conv ///
-  input = Matrix::create(1, 4 * 4, false, false);
-  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
-  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST1 for deconv ///
-  input = Matrix::create(1, 2 * 2, false, false);
-  real inputDataT[] = {1, 2, 3, 4};
-  input->setData(inputDataT);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for conv ///
-  input = Matrix::create(1, 3 * 3 * 2, false, false);
-  real inputData2[] = {
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
-  real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData2);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for conv ///
-  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for deconv ///
-  input = Matrix::create(1, 2 * 2 * 2, false, false);
-  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
-  input->setData(inputData2T);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for deconv ///
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
deleted file mode 100644
index 34eb0dedf..000000000
--- a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-#include <sstream>
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const size_t MAX_SEQ_NUM = 23;
-const size_t MAX_SEQ_LEN = 50;
-const size_t MAX_BEAM_SIZE = 27;
-
-const size_t SEED = (size_t)(time(NULL));
-
-struct SingleBeamExpansion {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<real> candidateScores;
-
-  // TODO(caoying): store this into Argument.ids
-  vector<real> selectedIndices;
-
-  vector<int> groundTruth;
-  vector<size_t> inBeam;
-  vector<int> rowIdxInBeam;
-  vector<int> colIdxInBeam;
-
-  void resetGroundTruth(size_t n) {
-    groundTruth.clear();
-    groundTruth.resize(n, -1);
-
-    inBeam.clear();
-    inBeam.resize(n, 0);
-
-    rowIdxInBeam.clear();
-    rowIdxInBeam.resize(n, -1);
-
-    colIdxInBeam.clear();
-    colIdxInBeam.resize(n, -1);
-  }
-};
-
-inline float randFloat() {
-  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-}
-
-void genRand(real* numbers, size_t n) {
-  default_random_engine generator;
-  uniform_real_distribution<real> distribution(0.0, 1.0);
-  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genCandidateScores(bool hasSubseq,
-                        size_t beamSize,
-                        SingleBeamExpansion& prevBeam,
-                        SingleBeamExpansion& curBeam) {
-  vector<int>& seqStartPos = curBeam.seqStartPos;
-  seqStartPos.resize(1, 0);
-  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  if (prevBeam.selectedIndices.size()) {
-    if (prevBeam.subSeqStartPos.size() > 1) {
-      int seqIdx = 1;
-      // samples in previous beam are nested sequences.
-      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
-        for (size_t j = 0; j < beamSize; ++j) {
-          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
-          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
-                                   subSeqStartPos.back());
-        }
-        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          seqIdx++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
-        if (i && i % beamSize == 0) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          if (i == prevBeam.selectedIndices.size()) break;
-        }
-        if (prevBeam.selectedIndices[i] == -1.) continue;
-        subSeqStartPos.push_back(subSeqStartPos.back() +
-                                 (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  } else {
-    // the first beam expansion
-    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int i = 0; i < seqNum; ++i) {
-      if (hasSubseq) {
-        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
-          subSeqStartPos.push_back(subSeqStartPos.back() +
-                                   (1 + (rand() % MAX_SEQ_LEN)));
-        seqStartPos.push_back(subSeqStartPos.back());
-      } else {
-        seqStartPos.push_back(seqStartPos.back() +
-                              (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  }
-
-  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
-  curBeam.candidateScores.resize(totalSeqNum, 0.);
-  genRand(curBeam.candidateScores.data(), totalSeqNum);
-}
-
-void genSelectedIndices(size_t beamSize,
-                        vector<int>& seqStartPos,
-                        vector<real>& selectedIndices) {
-  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
-  selectedIndices.resize(selectedIdsCount, -1.);
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    int n = min(seqLen, static_cast<int>(beamSize));
-    vector<real> ids = randSampling(seqLen, n);
-    memcpy(selectedIndices.data() + i * beamSize,
-           ids.data(),
-           sizeof(real) * ids.size());
-  }
-}
-
-void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
-                    size_t beamSize) {
-  SingleBeamExpansion& beam = beamExpansions[1];
-  size_t seqNum = beam.seqStartPos.size() - 1;
-  for (size_t i = 2; i < beamExpansions.size(); ++i)
-    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
-
-  srand(SEED);
-
-  // initialize the first beam.
-  beam.resetGroundTruth(seqNum);
-  for (size_t i = 0; i < seqNum; ++i) {
-    if (randFloat() > 0.5) {
-      /*
-       * force the randomly generated label falls in the beam by chance 0.5.
-       * otherwise, when sequence length is relatively long and beam size is
-       * relatively small, the gold sequences falls off the beam at in the
-       * first search.
-       */
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      beam.colIdxInBeam[i] =
-          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
-            return val != -1.;
-          });
-      beam.groundTruth[i] =
-          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
-      beam.inBeam[i] = 1;
-    } else {
-      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
-      beam.groundTruth[i] = label;
-
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      real* endPos = begPos + beamSize;
-      real* lblPos = find(begPos, endPos, real(label));
-      if (lblPos != endPos) {
-        beam.inBeam[i] = 1;
-        beam.colIdxInBeam[i] = lblPos - begPos;
-      }
-    }
-    beam.rowIdxInBeam[i] = i;
-  }
-
-  // iterate over each beam expansions
-  for (size_t i = 2; i < beamExpansions.size(); ++i) {
-    SingleBeamExpansion& curBeam = beamExpansions[i];
-    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
-    curBeam.resetGroundTruth(seqNum);
-
-    // iterate over each sequence
-    for (size_t j = 0; j < seqNum; ++j) {
-      if (!prevBeam.inBeam[j]) continue;
-
-      // gold sequence falls in the beam in previous search.
-      real* begPos = prevBeam.selectedIndices.data();
-      int offset =
-          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
-      curBeam.rowIdxInBeam[j] = count_if(
-          begPos, begPos + offset, [](const real& val) { return val != -1.; });
-
-      if (randFloat() > 0.5) {
-        // force the randomly generated label falls in the beam by chance 0.5.
-
-        real* start =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
-                  return val != -1.;
-                });
-        curBeam.colIdxInBeam[j] = n;
-        curBeam.groundTruth[j] = *(start + n);
-        curBeam.inBeam[j] = 1;
-      } else {
-        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
-                 curBeam.subSeqStartPos.size() - 1);
-        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
-        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
-        CHECK_GT(size_t(end), size_t(start));
-        int label = rand() % (end - start);
-
-        curBeam.groundTruth[j] = label;
-        real* findBeg =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        real* lblPos =
-            find(findBeg, findBeg + beamSize, static_cast<real>(label));
-        if (lblPos != (findBeg + beamSize)) {
-          curBeam.inBeam[j] = 1;
-          curBeam.colIdxInBeam[j] = lblPos - findBeg;
-        }
-      }
-    }
-  }
-}
-
-void genOneBeam(size_t beamSize,
-                bool hasSubseq,
-                SingleBeamExpansion& prevBeam,
-                SingleBeamExpansion& curBeam) {
-  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
-  genSelectedIndices(beamSize,
-                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
-                     curBeam.selectedIndices);
-}
-
-void genRandomBeamExpansion(size_t expansionCount,
-                            size_t beamSize,
-                            vector<SingleBeamExpansion>& beamExpansions) {
-  beamExpansions.clear();
-  beamExpansions.resize(expansionCount + 1);
-
-  // beamExpansions[0] is reserved.
-  for (size_t i = 1; i <= expansionCount; ++i)
-    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
-  genGroundTruth(beamExpansions, beamSize);
-}
-
-void testCrossEntropyOverBeam(bool useGpu,
-                              size_t beamSize,
-                              vector<SingleBeamExpansion>& beams) {
-  TestConfig config;
-  config.layerConfig.set_type("cross_entropy_over_beam");
-
-  size_t seqNum = 0;
-  for (size_t i = 1; i < beams.size(); ++i) {
-    const SingleBeamExpansion& beam = beams[i];
-    // create scores for all the candidates
-    MatrixPtr candidateScorePtr =
-        Matrix::create(beam.candidateScores.size(), 1, false, false);
-    candidateScorePtr->copyFrom(beam.candidateScores.data(),
-                                beam.candidateScores.size());
-
-    ostringstream paramName;
-    paramName << "candidate_scores_" << i;
-
-    if (beam.subSeqStartPos.size() > 1) {
-      seqNum = beam.subSeqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos,
-                                  beam.subSeqStartPos});
-    } else {
-      seqNum = beam.seqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos});
-    }
-    config.layerConfig.add_inputs();
-
-    // create indices for the selected candidates
-    MatrixPtr selectedCandidates =
-        Matrix::create(seqNum, beamSize, false, false);
-    selectedCandidates->copyFrom(beam.selectedIndices.data(),
-                                 beam.selectedIndices.size());
-    paramName.clear();
-    paramName << "selected_candidates_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
-    config.layerConfig.add_inputs();
-
-    // create the ground truth
-    paramName.clear();
-    paramName << "label_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
-    config.layerConfig.add_inputs();
-  }
-
-  testLayerGrad(
-      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
-}
-
-TEST(Layer, CrossEntropyOverBeam) {
-  LOG(INFO) << "SEED = " << SEED;
-  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
-  LOG(INFO) << "beamSize = " << beamSize;
-
-  // TODO(caoying): test with random beam expansions.
-  const size_t expansionCount = 3;
-  vector<SingleBeamExpansion> beams;
-  genRandomBeamExpansion(expansionCount, beamSize, beams);
-
-  for (bool useGpu : {false, true})
-    testCrossEntropyOverBeam(useGpu, beamSize, beams);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(SEED);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_DetectionOutput.cpp b/paddle/legacy/gserver/tests/test_DetectionOutput.cpp
deleted file mode 100644
index 486521426..000000000
--- a/paddle/legacy/gserver/tests/test_DetectionOutput.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of priorBox layer and check to see if its output
-// matches the given result
-void doOneDetectionOutputTest(MatrixPtr& inputLoc,
-                              MatrixPtr& inputConf,
-                              MatrixPtr& inputPriorBox,
-                              size_t feature_map_width,
-                              size_t feature_map_height,
-                              real nms_threshold,
-                              bool use_gpu,
-                              MatrixPtr& result) {
-  // Setting up the detection output layer
-  TestConfig configt;
-  configt.layerConfig.set_type("detection_output");
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  configt.layerConfig.add_inputs();
-  configt.layerConfig.add_inputs();
-
-  DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
-  detOutput->set_width(feature_map_width);
-  detOutput->set_height(feature_map_height);
-  detOutput->set_nms_threshold(nms_threshold);
-  detOutput->set_num_classes(2);
-  detOutput->set_nms_top_k(20);
-  detOutput->set_keep_top_k(10);
-  detOutput->set_background_id(0);
-  detOutput->set_confidence_threshold(0.01);
-  detOutput->set_input_num(1);
-  configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
-  configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
-  configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
-  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
-  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
-
-  // test layer initialize
-  bool store_FLAGS_use_gpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = use_gpu;
-  std::vector<ParameterPtr> parameters;
-  LayerPtr detectionOutputLayer;
-  initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
-  FLAGS_use_gpu = store_FLAGS_use_gpu;
-  detectionOutputLayer->forward(PASS_GC);
-  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
-}
-
-TEST(Layer, detectionOutputLayerFwd) {
-  bool useGpu = false;
-  // CPU case 1.
-  MatrixPtr inputLoc;
-  MatrixPtr inputConf;
-  MatrixPtr inputPriorBox;
-  MatrixPtr result, result2, result3, result4;
-  real nmsTreshold = 0.01;
-  real inputLocData[] = {0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1,
-                         0.1};
-  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
-  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
-                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
-                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
-                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
-  real resultData[] = {
-      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
-  inputLoc = Matrix::create(1, 16, false, useGpu);
-  inputConf = Matrix::create(1, 8, false, useGpu);
-  inputPriorBox = Matrix::create(1, 32, false, useGpu);
-  result = Matrix::create(1, 7, false, useGpu);
-  inputLoc->setData(inputLocData);
-  inputConf->setData(inputConfData);
-  inputPriorBox->setData(inputPriorBoxData);
-  result->setData(resultData);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result);
-
-  // CPU case 2.
-  nmsTreshold = 0.2;
-  result2 = Matrix::create(2, 7, false, useGpu);
-  real resultData2[] = {0,
-                        1,
-                        0.68997443,
-                        0.099959746,
-                        0.099959746,
-                        0.50804031,
-                        0.50804031,
-                        0,
-                        1,
-                        0.59868765,
-                        0.29995975,
-                        0.29995975,
-                        0.70804024,
-                        0.70804024};
-  result2->setData(resultData2);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result2);
-
-#ifdef PADDLE_WITH_CUDA
-  // GPU case 1.
-  useGpu = true;
-  inputLoc = Matrix::create(1, 16, false, useGpu);
-  inputConf = Matrix::create(1, 8, false, useGpu);
-  inputPriorBox = Matrix::create(1, 32, false, useGpu);
-  inputLoc->copyFrom(inputLocData, 16);
-  inputConf->copyFrom(inputConfData, 8);
-  inputPriorBox->copyFrom(inputPriorBoxData, 32);
-
-  nmsTreshold = 0.01;
-  result3 = Matrix::create(1, 7, false, useGpu);
-  result3->copyFrom(resultData, 7);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result3);
-
-  // GPU case 2.
-  nmsTreshold = 0.2;
-  result4 = Matrix::create(2, 7, false, useGpu);
-  result4->copyFrom(resultData2, 14);
-  doOneDetectionOutputTest(inputLoc,
-                           inputConf,
-                           inputPriorBox,
-                           /* feature_map_width */ 1,
-                           /* feature_map_height */ 1,
-                           nmsTreshold,
-                           useGpu,
-                           result4);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Evaluator.cpp b/paddle/legacy/gserver/tests/test_Evaluator.cpp
deleted file mode 100644
index 8aab50d23..000000000
--- a/paddle/legacy/gserver/tests/test_Evaluator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-};
-
-struct TestConfig {
-  EvaluatorConfig evaluatorConfig;
-  std::vector<InputDef> inputDefs;
-  bool testAccumulate;
-  TestConfig() : testAccumulate(true) {}
-};
-
-void testEvaluator(TestConfig testConf,
-                   string testEvaluatorName,
-                   size_t batchSize,
-                   bool useGpu) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  testConf.evaluatorConfig.set_name(testEvaluatorName);
-  LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type()
-            << " useGpu=" << useGpu;
-
-  std::vector<Argument> arguments;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    Argument data;
-    size_t dim = testConf.inputDefs[i].dim;
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-        data.value = Matrix::create(batchSize, dim, false, useGpu);
-        data.value->randomizeUniform();
-
-        // make sure output > 0 && output < 1
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        data.ids = VectorT<int>::create(batchSize, useGpu);
-        data.ids->rand(dim);  // now rand number can be 0 to inputDefs[i].dim.
-        break;
-      case INPUT_SPARSE_NON_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(batchSize,
-                                            dim,
-                                            /* withValue= */ false,
-                                            useGpu);
-        break;
-      default:
-        LOG(FATAL) << " unknown inputType ";
-        return;
-    }
-
-    ICpuGpuVectorPtr sequenceStartPositions;
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) {
-      if (!sequenceStartPositions) {
-        generateSequenceStartPositions(batchSize, sequenceStartPositions);
-      }
-      data.sequenceStartPositions = sequenceStartPositions;
-    }
-
-    arguments.push_back(data);
-  }
-
-  Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig);
-  double totalScore = 0.0;
-  testEvaluator->start();
-  totalScore += testEvaluator->evalImp(arguments);
-  testEvaluator->updateSamplesNum(arguments);
-  testEvaluator->finish();
-  LOG(INFO) << *testEvaluator;
-
-  std::vector<std::string> names;
-  testEvaluator->getNames(&names);
-  paddle::Error err;
-  for (auto& name : names) {
-    auto value = testEvaluator->getValue(name, &err);
-    ASSERT_TRUE(err.isOK());
-    LOG(INFO) << name << " " << value;
-    auto tp = testEvaluator->getType(name, &err);
-    ASSERT_TRUE(err.isOK());
-    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
-  }
-
-  double totalScore2 = 0.0;
-  if (testConf.testAccumulate) {
-    testEvaluator->start();
-    totalScore2 += testEvaluator->evalImp(arguments);
-    testEvaluator->finish();
-    EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5);
-  }
-}
-
-void testEvaluatorAll(TestConfig testConf,
-                      string testEvaluatorName,
-                      size_t batchSize) {
-  testEvaluator(testConf, testEvaluatorName, batchSize, true);
-  testEvaluator(testConf, testEvaluatorName, batchSize, false);
-}
-
-TEST(Evaluator, detection_map) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("detection_map");
-  config.evaluatorConfig.set_overlap_threshold(0.5);
-  config.evaluatorConfig.set_background_id(0);
-  config.evaluatorConfig.set_ap_type("Integral");
-  config.evaluatorConfig.set_evaluate_difficult(0);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 7});
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
-  config.evaluatorConfig.set_evaluate_difficult(false);
-  testEvaluatorAll(config, "detection_map", 100);
-
-  config.evaluatorConfig.set_evaluate_difficult(true);
-  testEvaluatorAll(config, "detection_map", 100);
-}
-
-TEST(Evaluator, classification_error) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("classification_error");
-  config.evaluatorConfig.set_top_k(5);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 50});
-  testEvaluatorAll(config, "classification_error", 100);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "classification_error_weight", 100);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_DATA, "output", 100});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100});
-  // Not support GPU
-  testEvaluator(config, "classification_error_multi_binary_label", 50, false);
-
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "classification_error_weight_multi_binary_label", 50, false);
-}
-
-TEST(Evaluator, sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("sum");
-
-  // sum of output
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  testEvaluatorAll(config, "sum_output", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_output_weight", 200);
-
-  // sum of label
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "sum_label", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_label_weight", 200);
-}
-
-TEST(Evaluator, last_column_sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-sum");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  testEvaluatorAll(config, "last-column-sum", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-sum_weight", 200);
-}
-
-TEST(Evaluator, last_column_auc) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-auc");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 2});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 2});
-  testEvaluatorAll(config, "last-column-auc", 500);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-auc_weight", 200);
-}
-
-TEST(Evaluator, precision_recall) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("precision_recall");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "precision_recall", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  LOG(INFO) << "positive_label = 5";
-  config.evaluatorConfig.set_positive_label(5);
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.evaluatorConfig.set_positive_label(-1);
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10});
-  // Not support GPU
-  testEvaluator(config, "precision_recall_multi_binary_label", 100, false);
-
-  LOG(INFO) << "classification_threshold = 0.4";
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "precision_recall_weight_multi_binary_label", 100, false);
-}
-
-TEST(Evaluator, ctc_error_evaluator) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("ctc_edit_distance");
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1});
-  testEvaluatorAll(config, "ctc_error_evaluator", 100);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Expand.cpp b/paddle/legacy/gserver/tests/test_Expand.cpp
deleted file mode 100644
index fa1c86d13..000000000
--- a/paddle/legacy/gserver/tests/test_Expand.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of expand layer and check to see if its output
-// matches the given result.(Test onlyCPU currently.)
-void doOneExpandTest(string trans_type,
-                     bool hasSubseq,
-                     bool useGpu,
-                     Argument& input1,
-                     Argument& input2,
-                     Argument& result) {
-  FLAGS_use_gpu = false;
-  // Setting up the expand layer
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  auto inputType1 =
-      trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA;
-  config.inputDefs.push_back({inputType1, "layer0", 1, 0});
-  auto inputType2 =
-      hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA;
-
-  config.inputDefs.push_back({inputType2, "layer1", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      config, &dataLayers, &datas, &layerMap, "expand", 1, false, useGpu);
-  dataLayers[0]->getOutput() = input1;
-  dataLayers[1]->getOutput() = input2;
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr expandLayer;
-  initTestLayer(config, &layerMap, &parameters, &expandLayer);
-  expandLayer->forward(PASS_GC);
-  checkMatrixEqual(expandLayer->getOutputValue(), result.value);
-}
-
-TEST(Layer, ExpandLayerFwd) {
-  bool useGpu = false;
-
-  // Assume batch_size =3 in all cases.
-
-  // CPU case 1. non-seq expand to seq
-  // input1 = 1,2,3
-  // input2 = [4,5],[6],[7,8,9]
-  // result = [1,1],[2],[3,3,3]
-  Argument input1, input2, result;
-  input1.value = Matrix::create(3, 1, false, useGpu);
-  real input1Data[] = {1, 2, 3};
-  input1.value->setData(input1Data);
-
-  input2.value = Matrix::create(6, 1, false, useGpu);
-  real input2Data[] = {4, 5, 6, 7, 8, 9};
-  input2.value->setData(input2Data);
-  input2.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
-  int input2Seq[] = {0, 2, 3, 6};
-  input2.sequenceStartPositions->copyFrom(input2Seq, 4, useGpu);
-
-  result.value = Matrix::create(6, 1, false, useGpu);
-  real resultData[] = {1, 1, 2, 3, 3, 3};
-  result.value->setData(resultData);
-
-  doOneExpandTest("non-seq", false, useGpu, input1, input2, result);
-
-  // CPU case 2. non-seq expand to sub-seq
-  // NOTE: input1.batch_size == input2.sequencelength in this case.
-  // i.e, input1 expands by input2.sequence
-  // input1 = 1,2,3
-  // input2 = [[4,5]],[[6]],[[7],[8,9]]
-  // result = [[1,1]],[[2]],[[3],[3,3]]
-  input2.subSequenceStartPositions = ICpuGpuVector::create(5, useGpu);
-  int input2SubSeq[] = {0, 2, 3, 4, 6};
-  input2.subSequenceStartPositions->copyFrom(input2SubSeq, 5, useGpu);
-
-  doOneExpandTest("non-seq", true, useGpu, input1, input2, result);
-
-  // CPU case 3. seq expand to sub-seq
-  // input1 = [1,2],[3],[4]
-  // input2 = [[4,5]],[[6]],[[7],[8,9]]
-  // result = [[1,1]],[[2]],[[3],[4,4]]
-  Matrix::resizeOrCreate(input1.value, 4, 1, false, useGpu);
-  real input1Data_case3[] = {1, 2, 3, 4};
-  input1.value->setData(input1Data_case3);
-
-  input1.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
-  int input1Seq[] = {0, 2, 3, 4};
-  input1.sequenceStartPositions->copyFrom(input1Seq, 4, useGpu);
-
-  real resultData_case3[] = {1, 1, 2, 3, 4, 4};
-  result.value->setData(resultData_case3);
-
-  doOneExpandTest("seq", true, useGpu, input1, input2, result);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
deleted file mode 100644
index e15b4e503..000000000
--- a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-vector<int> randSampling(int range, int n) {
-  CHECK_GE(range, n);
-  vector<int> num(range);
-  iota(begin(num), end(num), 0);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  return num;
-}
-
-void genRandomSeqInfo(vector<int>& seqStartPosition,
-                      vector<int>& subSeqStartPosition) {
-  const int maxSeqNum = 100;
-  // generate random start position information
-  int seqNum = 1 + (rand() % maxSeqNum);
-  seqStartPosition.resize(seqNum + 1, 0);
-  subSeqStartPosition.resize(1, 0);
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqLen = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqLen; ++j)
-      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
-    seqStartPosition[i + 1] = subSeqStartPosition.back();
-  }
-}
-
-void genRandomGroundTruth(real* values,
-                          vector<vector<int>>& groundTruth,
-                          vector<int>& startPos,
-                          size_t beamSize) {
-  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
-  for (size_t i = 0; i < startPos.size() - 1; ++i) {
-    int seqLen = startPos[i + 1] - startPos[i];
-    vector<int> pos =
-        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
-    for (size_t j = 0; j < pos.size(); ++j) {
-      groundTruth[i][j] = pos[j];
-      values[startPos[i] + pos[j]] = 1.;
-    }
-  }
-}
-
-void checkLayerOut(vector<vector<int>> groundTruth,
-                   real* layerOut,
-                   size_t beamSize) {
-  for (size_t i = 0; i < groundTruth.size(); ++i) {
-    int begPos = i * beamSize;
-    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
-    sort(begin(tmp), end(tmp));
-    sort(begin(groundTruth[i]), end(groundTruth[i]));
-    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
-  }
-}
-
-TEST(Layer, kmaxSeqScoreLayer) {
-  const size_t maxBeamSize = 100;
-  size_t beamSize = 1 + (rand() % maxBeamSize);
-
-  vector<int> seqStartPosition;
-  vector<int> subSeqStartPosition;
-  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
-  MatrixPtr inValue =
-      Matrix::create(subSeqStartPosition.back(), 1, false, false);
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-
-  for (auto hasSubseq : {false, true}) {
-    vector<vector<int>> groundTruth;
-    inValue->randomizeUniform();
-    genRandomGroundTruth(inValue->getData(),
-                         groundTruth,
-                         hasSubseq ? subSeqStartPosition : seqStartPosition,
-                         beamSize);
-
-    for (auto useGpu : mode) {
-      TestConfig config;
-      config.layerConfig.set_type("kmax_seq_score");
-      config.layerConfig.set_beam_size(beamSize);
-
-      if (hasSubseq) {
-        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                    "scores",
-                                    inValue,
-                                    seqStartPosition,
-                                    subSeqStartPosition});
-      } else {
-        config.inputDefs.push_back(
-            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
-      }
-      config.layerConfig.add_inputs();
-
-      // data layer initialize
-      std::vector<DataLayerPtr> dataLayers;
-      LayerMap layerMap;
-      vector<Argument> datas;
-      initDataLayer(
-          config,
-          &dataLayers,
-          &datas,
-          &layerMap,
-          "kmax_seq_score",
-          100 /* actually this parameter is unused in self-defined input*/,
-          false,
-          useGpu);
-      // test layer initialize
-      std::vector<ParameterPtr> parameters;
-      LayerPtr kmaxSeqScoreLayer;
-      FLAGS_use_gpu = useGpu;
-      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
-      kmaxSeqScoreLayer->forward(PASS_TRAIN);
-
-      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
-      CHECK_EQ(outValue->getHeight(),
-               hasSubseq ? subSeqStartPosition.size() - 1
-                         : seqStartPosition.size() - 1);
-      CHECK_EQ(outValue->getWidth(), beamSize);
-      checkLayerOut(groundTruth, outValue->getData(), beamSize);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand((size_t)(time(NULL)));
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_LayerGrad.cpp b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
deleted file mode 100644
index 979cf8ee6..000000000
--- a/paddle/legacy/gserver/tests/test_LayerGrad.cpp
+++ /dev/null
@@ -1,2532 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <cudnn.h>
-#endif
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-TEST(Operator, dot_mul) {
-  TestConfig config;
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("dot_mul");
-  operatorConf.set_dotmul_scale(-1);
-
-  testOperatorGrad(config, operatorConf, 100, false, false);
-}
-
-TEST(Projection, context) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20}) {
-        for (auto trainablePadding : {false, true}) {
-          LOG(INFO) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " batchSize=" << batchSize
-                    << " trainablePadding=" << trainablePadding;
-          ProjectionConfig conf;
-          conf.set_type("context");
-          conf.set_input_size(10);
-          conf.set_context_start(contextStart);
-          conf.set_context_length(contextLength);
-          conf.set_trainable_padding(trainablePadding);
-          conf.set_output_size(conf.context_length() * conf.input_size());
-          int pad =
-              std::max(0, -conf.context_start()) +
-              std::max(0, conf.context_start() + conf.context_length() - 1);
-          for (auto useGpu : {false, true}) {
-            testProjectionGrad(
-                conf,
-                INPUT_SEQUENCE_DATA,
-                trainablePadding ? conf.input_size() * pad : 0,
-                batchSize,
-                useGpu,
-                contextStart + contextLength <= 1);  // = testState
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Projection, trans_fc) {
-  ProjectionConfig conf;
-  conf.set_type("trans_fc");
-  conf.set_input_size(50);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1000,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, fc) {
-  ProjectionConfig conf;
-  conf.set_type("fc");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, dot_mul) {
-  ProjectionConfig conf;
-  conf.set_type("dot_mul");
-  conf.set_input_size(20);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 20,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, table) {
-  ProjectionConfig conf;
-  conf.set_type("table");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_LABEL,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, identity) {
-  ProjectionConfig conf;
-  conf.set_type("identity");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, slice) {
-  ProjectionConfig conf;
-  conf.set_type("slice");
-  conf.set_input_size(100);
-  SliceConfig& slice1 = *conf.add_slices();
-  slice1.set_start(10);
-  slice1.set_end(20);
-  SliceConfig& slice2 = *conf.add_slices();
-  slice2.set_start(50);
-  slice2.set_end(70);
-  conf.set_output_size(30);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 10,
-                       useGpu);
-  }
-}
-
-TEST(Projection, scaling) {
-  ProjectionConfig conf;
-  conf.set_type("scaling");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-void testProjectionConv(size_t groups, bool isDeconv) {
-  const int NUM_FILTERS = 18;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 2;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-
-#if CUDNN_VERSION >= 6000
-  const int DILATION = 2;
-#else
-  const int DILATION = 1;
-#endif
-
-  ProjectionConfig conf;
-  if (isDeconv) {
-    conf.set_type("convt");
-  } else {
-    conf.set_type("conv");
-  }
-  conf.set_num_filters(NUM_FILTERS);
-
-  ConvConfig* conv = conf.mutable_conv_conf();
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(DILATION);
-  conv->set_dilation_y(DILATION);
-  conv->set_groups(groups);
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-  }
-  conv->set_img_size(IMAGE_SIZE);
-  int output_x = outputSize(conv->img_size(),
-                            (conv->filter_size() - 1) * DILATION + 1,
-                            conv->padding(),
-                            conv->stride(),
-                            /* caffeMode */ true);
-  int output_y = outputSize(conv->img_size(),
-                            (conv->filter_size_y() - 1) * DILATION + 1,
-                            conv->padding_y(),
-                            conv->stride_y(),
-                            /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  conv->set_output_y(output_y);
-  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
-            << "; output_y: " << output_y;
-  if (isDeconv) {
-    int deconv_image_x = imageSize(output_x,
-                                   (conv->filter_size() - 1) * DILATION + 1,
-                                   conv->padding(),
-                                   conv->stride(),
-                                   /* caffeMode */ true);
-    int deconv_image_y = imageSize(output_y,
-                                   (conv->filter_size_y() - 1) * DILATION + 1,
-                                   conv->padding_y(),
-                                   conv->stride_y(),
-                                   /* caffeMode */ true);
-
-    LOG(INFO) << " deconv_image_x: " << deconv_image_x
-              << "; deconv_image_y: " << deconv_image_y;
-    conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
-  } else {
-    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-    conf.set_output_size(output_x * output_y * NUM_FILTERS);
-  }
-
-  testProjectionGrad(conf,
-                     INPUT_DATA,
-                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
-                         FILTER_SIZE_Y / groups,
-                     /* batchSize */ 100,
-                     true,
-                     false,
-                     NUM_FILTERS,
-                     true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Projection, conv) {
-  /// test ConvProjection
-  testProjectionConv(1, false);
-  testProjectionConv(3, false);
-  /// test ConvTransProjection
-  testProjectionConv(1, true);
-  testProjectionConv(3, true);
-}
-#endif
-
-TEST(Layer, BilinearInterpLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("bilinear_interp");
-  config.biasSize = 0;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  ImageConfig* image = bilinear->mutable_image_conf();
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-
-  for (auto useGpu : {false, true}) {
-    for (auto outSize : {32, 64}) {
-      bilinear->set_out_size_x(outSize);
-      bilinear->set_out_size_y(outSize);
-      testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, concat) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("concat");
-  config.layerConfig.set_size(15);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "concat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, AddtoLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "addto", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CTCLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("ctc");
-  config.layerConfig.set_norm_by_times(false);
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "ctc",
-                  100,
-                  /* trans */ false, /* useGpu */
-                  useGpu);
-  }
-}
-
-TEST(Layer, cosSimLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CosSimVecMatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos_vm");
-  config.layerConfig.set_size(5);  // output size
-  config.layerConfig.set_cos_scale(2.0);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos_vm", 100, false, useGpu);
-  }
-}
-
-void testDepthwiseConvLayer(const string& type, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 32;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(32);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(16);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(8);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
-}
-
-TEST(Layer, depthwiseConvLayer) {
-  //  'depthwise_conv' is a sepecial case of 'exconv' whose
-  //  groups size equals to the input channels size.
-  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
-#endif
-}
-
-void testConvLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  int dilation = 2;
-  if (type == "cudnn_conv") {
-#if CUDNN_VERSION >= 6000
-    dilation = 2;
-#else
-    dilation = 1;
-#endif
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(2);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(dilation);
-  conv->set_dilation_y(dilation);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                (conv->filter_size() - 1) * dilation + 1,
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                (conv->filter_size_y() - 1) * dilation + 1,
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "conv", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convLayer) {
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
-  testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testConvTransLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 3;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(3);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-
-  config.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "convTrans", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convTransLayer) {
-  for (auto useGpu : {false, true}) {
-    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
-  }
-#ifdef PADDLE_WITH_CUDA
-  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, blockExpandLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("blockexpand");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-  blockExpand->set_img_size_x(64);
-  blockExpand->set_img_size_y(32);
-  blockExpand->set_channels(3);
-  blockExpand->set_padding_x(0);
-  blockExpand->set_padding_y(0);
-  blockExpand->set_block_x(4);
-  blockExpand->set_block_y(32);
-  blockExpand->set_stride_x(2);
-  blockExpand->set_stride_y(2);
-  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-                                       blockExpand->block_x(),
-                                       blockExpand->padding_x(),
-                                       blockExpand->stride_x(),
-                                       /* caffeMode */ false));
-  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-                                       blockExpand->block_y(),
-                                       blockExpand->padding_y(),
-                                       blockExpand->stride_y(),
-                                       /* caffeMode */ false));
-  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
-                              blockExpand->channels());
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "blockexpand", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, maxoutLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("maxout");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MaxOutConfig* maxout = input->mutable_maxout_conf();
-  ImageConfig* image = maxout->mutable_image_conf();
-
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-  maxout->set_groups(2);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "maxout", 10, false, useGpu);
-  }
-}
-
-void testFcLayer(string format, size_t nnz) {
-  TestConfig config;
-  config.biasSize = 1024;
-  config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(1024);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_drop_rate(0.1);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
-  config.layerConfig.add_inputs();
-
-  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-            << config.inputDefs[0].sparse.format;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "fc",
-                  100,
-                  /* trans */ false,
-                  useGpu,
-                  /* weight */ true);
-  }
-}
-
-TEST(Layer, fcLayer) {
-  testFcLayer("", 1024 * 1024 * 2);
-  testFcLayer("csc", 1024 * 10);
-  testFcLayer("csr", 1024 * 10);
-}
-
-TEST(Layer, SelectiveFullyConnectedLayer) {
-  TestConfig config;
-  size_t nin = 16;
-  size_t nout = 256;
-  config.layerConfig.set_type("selective_fc");
-  config.layerConfig.set_size(nout);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_has_selected_colums(true);
-  config.layerConfig.set_selective_fc_pass_generation(false);
-  config.biasSize = nout;
-
-  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
-  config.layerConfig.add_inputs();
-
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ false,
-                false);
-#ifdef PADDLE_WITH_CUDA
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ true,
-                false);
-#endif
-}
-
-TEST(Layer, DataNormLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("data_norm");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-  config.inputDefs.back().isStatic = true;
-  config.layerConfig.add_inputs();
-
-  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-    config.layerConfig.set_data_norm_strategy(strategy);
-    // The parameters are static, so not support GPU now
-    testLayerGrad(config,
-                  "data_norm",
-                  200,
-                  /* trans */ false,
-                  /* useGpu */ false);
-  }
-}
-
-TEST(Layer, hsigmoidLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("hsigmoid");
-  config.layerConfig.set_num_classes(5);
-  config.layerConfig.set_size(1);
-  config.biasSize = config.layerConfig.num_classes() - 1;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "hsigmoid",
-                  100,
-                  /* trans */ false,
-                  /* useGpu */ useGpu);
-  }
-}
-
-TEST(Layer, multi_cross) {
-  TestConfig config;
-  config.layerConfig.set_type("multi-class-cross-entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(
-        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_sparse_mat) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(layer, multi_binary_label_id) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_cross_with_selfnorm) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "multi_class_cross_entropy_with_selfnorm",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross_soft) {
-  TestConfig config;
-  config.layerConfig.set_type("soft_binary_class_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "soft_binary_class_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, sparse_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, sparse_float_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, square_error_weighted) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, huber_regression_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_regression");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto delta : {1, 3, 5}) {
-      config.layerConfig.set_delta(delta);
-      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, huber_two_class) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_classification");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
-  }
-}
-
-void testExpandLayer(string trans_type, bool hasSubseq) {
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  config.inputDefs.push_back(
-      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_1",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "expand", 30, false, useGpu);
-  }
-}
-
-TEST(Layer, ExpandLayer) {
-  testExpandLayer("non-seq", false);  // non-seq expand to seq
-  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-  testExpandLayer("seq", true);       // seq expand to hasSubseq
-}
-
-void testDegradeLayer(bool hasSubseq,
-                      string layer_type,
-                      string trans_type,
-                      int stride) {
-  TestConfig config;
-  config.layerConfig.set_type(layer_type);
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_seq_pool_stride(stride);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, layer_type, 100, false, useGpu);
-    }
-  };
-
-  if (layer_type == "average") {
-    for (auto strategy : {"average", "sum", "squarerootn"}) {
-      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy
-                << " seq_pool_stride=" << stride;
-      config.layerConfig.set_average_strategy(strategy);
-      testDegradeLayerGrad(config, layer_type);
-    }
-  } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-              << " seq_pool_stride=" << stride;
-    testDegradeLayerGrad(config, layer_type);
-  }
-}
-
-TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(false,
-                   "max",
-                   "non-seq",
-                   5);  // seq max to a shorten seq, stride window = 5
-  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
-}
-
-TEST(Layer, SequenceLastInstanceLayer) {
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // seq seqlastins to non-seq
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   5);  // seq seqlastins to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "seq",
-                   -1);  // hasSubseq seqlastins to seq
-}
-
-TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
-  testDegradeLayer(false,
-                   "average",
-                   "non-seq",
-                   5);  // seq average to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "average",
-                   "non-seq",
-                   -1);                          // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
-}
-
-TEST(Layer, SequenceConcatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqconcat");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqconcat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SequenceReshapeLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqreshape");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqreshape", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvShiftLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("conv_shift");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config, "conv_shift", 100, false, false);
-}
-
-TEST(Layer, PowerLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("power");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "power", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvexCombinationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("convex_comb");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "convex_comb", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, InterpolationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("interpolation");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "interpolation", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, DotProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("dot_prod");
-  config.layerConfig.set_size(1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "dot_prod", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, OuterProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("out_prod");
-  config.layerConfig.set_size(100);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "out_prod", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SlopeInterceptLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("slope_intercept");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_slope(1.0);
-  config.layerConfig.set_intercept(0.1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ScalingLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("scaling");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scaling", 100, false, useGpu);
-  }
-}
-
-void testNormLayer(const string& normType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_active_type("relu");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type(normType);
-  norm->set_channels(16);
-  norm->set_size(5);
-  norm->set_scale(0.001);
-  norm->set_pow(0.75);
-  norm->set_blocked(0);
-  norm->set_img_size(14);
-  norm->set_img_size_y(7);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  if (norm->norm_type() == "cmrnorm" ||
-      norm->norm_type() == "cmrnorm-projection") {
-    norm->set_scale(norm->scale() / norm->size());
-  } else {
-    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-  }
-
-  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-                              norm->channels());
-  config.biasSize = 0;
-
-  testLayerGrad(config, "norm", 100, trans, useGpu);
-}
-
-TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                true);
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                false);
-}
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(16);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(16);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void testPoolLayer(const string& poolType,
-                   bool trans,
-                   bool useGpu,
-                   bool excludeMode = true) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(14);
-  pool->set_img_size_y(14);
-  pool->set_exclude_mode(excludeMode);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_size_y(4);
-  pool->set_stride_y(3);
-  pool->set_img_size(10);
-  pool->set_img_size_y(20);
-  setPoolConfig(&config, pool, poolType);
-  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-                         ((float)pool->stride_y()) +
-                     1.5);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-#endif
-
-TEST(Layer, PoolLayer) {
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ false,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
-
-#ifdef PADDLE_WITH_CUDA
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ true,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-incl-pad-pool",
-                 /* trans= */ false,
-                 /* useGpu= */ true);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void setPool3DConfig(TestConfig* config,
-                     PoolConfig* pool,
-                     const string& poolType) {
-  // filter size
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-  const int CHANNELS = 16;
-
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool3d");
-  (*config).layerConfig.set_num_filters(NUM_FILTERS);
-
-  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
-  int pw = 0, ph = 0, pd = 0;
-  int sw = 2, sh = 2, sd = 2;
-
-  pool->set_pool_type(poolType);
-  pool->set_pool_type("avg");
-  pool->set_channels(CHANNELS);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_size_z(kd);
-  pool->set_padding(0);
-  pool->set_padding_y(0);
-  pool->set_padding_z(0);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-  pool->set_stride_z(sd);
-  pool->set_start(0);
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-  pool->set_output_z(od);
-}
-
-void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  pool->set_img_size(IMAGE_SIZE);
-  pool->set_img_size_y(IMAGE_SIZE_Y);
-  pool->set_img_size_z(IMAGE_SIZE_Z);
-
-  setPool3DConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool3d", 100, trans, useGpu);
-}
-
-TEST(Layer, Pool3DLayer) {
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testSppLayer(const string& poolType,
-                  const int pyramidHeight,
-                  bool trans,
-                  bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("spp");
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  SppConfig* sppConfig = input->mutable_spp_conf();
-  sppConfig->set_pool_type(poolType);
-  sppConfig->set_pyramid_height(pyramidHeight);
-  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-  imageConfig->set_channels(16);
-  imageConfig->set_img_size(10);
-  imageConfig->set_img_size_y(20);
-  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * imageConfig->channels());
-  testLayerGrad(config, "spp", 100, trans, useGpu);
-}
-
-TEST(Layer, SpatialPyramidPoolLayer) {
-  for (auto useGpu : {false, true}) {
-    for (auto pyramidHeight : {1, 2, 3}) {
-      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-      testSppLayer("max-projection", pyramidHeight, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, rankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, sumCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("sum_cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "sum_cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, weightedRankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, TensorLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("tensor");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = config.layerConfig.size();
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "tensor", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.biasSize = 4;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 28;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
-    }
-  }
-  for (auto useGpu : {true}) {
-    config.testBatchState = true;
-    config.layerConfig.set_reversed(false);
-    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, MDLstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("mdlstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 4 * 9;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_directions(true);
-  config.layerConfig.add_directions(true);
-
-  for (auto useGpu : {false, true}) {
-    for (int i = 0; i < 2; i++) {
-      for (int j = 0; j < 2; j++) {
-        config.layerConfig.set_directions(0, bool(i));
-        config.layerConfig.set_directions(1, bool(j));
-        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-      }
-    }
-  }
-}
-
-TEST(Layer, ParameterReluLayer) {
-  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-    TestConfig config;
-    config.layerConfig.set_type("prelu");
-    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_size(inputSize);
-    config.layerConfig.set_partial_sum(inputSize /
-                                       channels);  // size of feature map
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, "prelu", 100, false, useGpu);
-    }
-  };
-
-  testParameterReluLayer(192, 1);
-  testParameterReluLayer(192, 3);
-  testParameterReluLayer(192, 192);
-}
-
-TEST(Layer, ResizeLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("resize");
-  config.layerConfig.set_size(64);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "resize", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RotateLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("rotate");
-  const int CHANNEL = 2;
-  const int HEIGHT = 8;
-  const int WIDTH = 4;
-  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
-  config.layerConfig.set_size(INPUT_SIZE);
-  config.layerConfig.set_height(HEIGHT);
-  config.layerConfig.set_width(WIDTH);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rotate", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, NCELayer) {
-  TestConfig config;
-  size_t numClasses = 4;
-  config.layerConfig.set_type("nce");
-  config.layerConfig.set_size(1);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_num_classes(numClasses);
-  config.biasSize = numClasses;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
-  config.inputDefs.push_back(
-      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto withWeight : {false, true}) {
-    if (withWeight) {
-      config.inputDefs.push_back(
-          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-      config.layerConfig.add_inputs();
-    }
-
-    for (auto isIdLabel : {false, true}) {
-      config.inputDefs[1] = {
-          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-          "label",
-          /* dim= */ numClasses,
-          /* paraSize= */ 0};
-
-      for (auto withDist : {false, true}) {
-        config.layerConfig.clear_neg_sampling_dist();
-        if (withDist) {
-          double sum = 0;
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = rand();  // NOLINT use rand_r
-            config.layerConfig.add_neg_sampling_dist(p);
-            sum += p;
-          }
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = config.layerConfig.neg_sampling_dist(i) / sum;
-            config.layerConfig.set_neg_sampling_dist(i, p);
-          }
-        }
-        LOG(INFO) << "NCELayer "
-                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
-                  << " withDist=" << withDist;
-        // Not support GPU now
-        testLayerGrad(config,
-                      "nce",
-                      100,
-                      /* trans= */ false,
-                      /* useGpu */ false);
-      }
-    }
-  }
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gated_recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, GruStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gru_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, LstmStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstm_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, BatchNormalizationLayer) {
-  testBatchNormLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNormLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNormLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  const int IMG_SIZE_Z = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-  img_conf->set_img_size_z(IMG_SIZE_Z);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, testBatchNorm3DLayer) {
-  testBatchNorm3DLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNorm3DLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testConvOperator(bool isDeconv) {
-  TestConfig config;
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 9;
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  if (isDeconv) {
-    operatorConf.set_type("convt");
-  } else {
-    operatorConf.set_type("conv");
-  }
-  ConvConfig* conv = operatorConf.mutable_conv_conf();
-  operatorConf.set_num_filters(NUM_FILTERS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-    config.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                conv->output_x() * conv->output_y() * CHANNELS,
-                                0});
-    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                                NUM_FILTERS);
-  }
-
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_1",
-       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-}
-
-TEST(Operator, conv) {
-  testConvOperator(/*isDeconv*/ true);
-  testConvOperator(/*isDeconv*/ false);
-}
-
-TEST(Layer, FeatureMapExpandLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("featmap_expand");
-  const int CHANNELS = 10;
-  const int INPUT_SIZE = 100;
-  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-  config.layerConfig.set_num_filters(CHANNELS);
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              /* dim= */ INPUT_SIZE,
-                              /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    for (auto asRowVec : {false, true}) {
-      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
-      testLayerGrad(config,
-                    "featmap_expand",
-                    /*batch_size*/ 100,
-                    /* trans= */ false,
-                    useGpu,
-                    /* useWeight */ true);
-    }
-  }
-}
-
-TEST(Layer, MultiplexLayer) {
-  TestConfig config;
-  const int LAYER_SIZE = 100;
-  config.layerConfig.set_type("multiplex");
-  config.layerConfig.set_size(LAYER_SIZE);
-
-  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, PadLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("pad");
-
-  int c = 4;
-  int h = 31;
-  int w = 36;
-  size_t size = c * h * w;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PadConfig* pad = input->mutable_pad_conf();
-  ImageConfig* image = pad->mutable_image_conf();
-
-  image->set_channels(c);
-  image->set_img_size(h);
-  image->set_img_size_y(w);
-  pad->add_pad_c(1);
-  pad->add_pad_c(2);
-  pad->add_pad_h(2);
-  pad->add_pad_h(3);
-  pad->add_pad_w(3);
-  pad->add_pad_w(5);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "pad", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, CrossChannelNormLayer) {
-  TestConfig config;
-  config.paramInitialMean = 1.;
-  config.paramInitialStd = 0.;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_size(100);
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cross-channel-norm");
-  norm->set_channels(10);
-  norm->set_size(100);
-  norm->set_scale(0);
-  norm->set_pow(0);
-  norm->set_blocked(0);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
-  }
-}
-
-TEST(Layer, smooth_l1) {
-  TestConfig config;
-  config.layerConfig.set_type("smooth_l1");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, multibox_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("multibox_loss");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
-  multiboxLoss->set_num_classes(21);
-  multiboxLoss->set_input_num(1);
-  multiboxLoss->set_overlap_threshold(0.5);
-  multiboxLoss->set_neg_pos_ratio(3);
-  multiboxLoss->set_neg_overlap(0.5);
-  multiboxLoss->set_background_id(0);
-  multiboxLoss->set_height(3);
-  multiboxLoss->set_width(3);
-
-  size_t gtNum = 1;
-  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
-  labelValue->randomizeUniform();
-  labelValue->add(-0.5);
-  labelValue->sigmoid(*labelValue);
-  real* labelData = labelValue->getData();
-  size_t labelWidth = labelValue->getWidth();
-  for (size_t i = 0; i < gtNum; ++i) {
-    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
-    *(labelData + i * labelWidth + 1) = 0.400259;
-    *(labelData + i * labelWidth + 2) = 0.377857;
-    *(labelData + i * labelWidth + 3) = 0.525712;
-    *(labelData + i * labelWidth + 4) = 0.519368;
-  }
-  vector<int> seqStartPositions(gtNum + 1, 0);
-  for (size_t i = 1; i <= gtNum; ++i) {
-    seqStartPositions[i] = i;
-  }
-
-  // Ensure at lease one matched bbox
-  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
-  priorValue->randomizeUniform();
-  priorValue->add(-0.5);
-  priorValue->sigmoid(*priorValue);
-  real* priorData = priorValue->getData();
-  *(priorData) = 0.424811;
-  *(priorData + 1) = 0.397059;
-  *(priorData + 2) = 0.538905;
-  *(priorData + 3) = 0.447091;
-  *(priorData + 4) = 0.425720;
-  *(priorData + 5) = 0.515228;
-  *(priorData + 6) = 0.519452;
-  *(priorData + 7) = 0.591065;
-
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
-  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
-  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
-  }
-}
-
-TEST(Layer, TransLayer) {
-  TestConfig config;
-  const int height = 128;
-  const int width = 256;
-  config.layerConfig.set_type("trans");
-  config.layerConfig.set_size(width);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, RowConvLayer) {
-  const int context = 3;
-  const int size = 512;
-
-  TestConfig config;
-  config.layerConfig.set_type("row_conv");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  RowConvConfig* conv = input->mutable_row_conv_conf();
-  conv->set_context_length(context);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, CropLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  config.layerConfig.set_axis(2);
-  config.layerConfig.add_offset(0);
-  config.layerConfig.add_offset(0);
-
-  // config input_1
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
-  input = config.layerConfig.add_inputs();
-  img = input->mutable_image_conf();
-  img->set_channels(2);
-  img->set_img_size(8);
-
-  // config crop layer
-  config.layerConfig.set_type("crop");
-  config.layerConfig.set_name("cropLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "crop", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, roi_pool) {
-  TestConfig config;
-  config.layerConfig.set_type("roi_pool");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
-  roiPoolConf->set_pooled_width(7);
-  roiPoolConf->set_pooled_height(7);
-  roiPoolConf->set_spatial_scale(1. / 16);
-  roiPoolConf->set_width(14);
-  roiPoolConf->set_height(14);
-
-  const size_t roiNum = 10;
-  const size_t roiDim = 10;
-  const size_t batchSize = 5;
-  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
-  roiValue->zeroMem();
-  real* roiData = roiValue->getData();
-  for (size_t i = 0; i < roiNum; ++i) {
-    roiData[i * roiDim + 0] = std::rand() % batchSize;
-    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
-    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
-    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
-    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
-    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
-    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, SwitchOrderLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  img->set_img_size_y(16);
-
-  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
-  reshape->add_height_axis(0);
-  reshape->add_height_axis(1);
-  reshape->add_height_axis(2);
-  reshape->add_width_axis(3);
-
-  // config softmax layer
-  config.layerConfig.set_type("switch_order");
-  config.layerConfig.set_name("switchOrderLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
-  }
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-TEST(Layer, SubNestedSequenceLayer) {
-  // layer size is not crutial for this layer,
-  // so use a small layer size in unittest
-  const int layerSize = 4;
-
-  const int maxSeqNum = 50;
-  const int maxSeqLen = 50;
-  const int maxBeamSize = 32;
-
-  srand((size_t)(time(NULL)));
-  int beamSize = 1 + (rand() % maxBeamSize);
-
-  TestConfig config;
-  config.layerConfig.set_type("sub_nested_seq");
-  config.layerConfig.set_name("sub_nested_seq_layer");
-  config.layerConfig.set_size(layerSize);
-
-  int seqNum = 1 + (rand() % maxSeqNum);
-
-  // sequence information for the first input, it is a nested sequence
-  vector<int> seqStartPos(seqNum + 1, 0);
-  vector<int> subSeqStartPos(1, 0);
-
-  // selected indices
-  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
-  selectedIndices->one();
-  selectedIndices->mulScalar(-1.);
-  real* indicesData = selectedIndices->getData();
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqNum; ++j) {
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % maxSeqLen)));
-    }
-    vector<real> selSeqs =
-        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
-    memcpy(indicesData + (i * beamSize),
-           selSeqs.data(),
-           selSeqs.size() * sizeof(real));
-    seqStartPos[i + 1] = subSeqStartPos.back();
-  }
-
-  MatrixPtr seqInputPtr =
-      Matrix::create(seqStartPos.back(), layerSize, false, false);
-  seqInputPtr->randomizeUniform();
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                              "nested_seq_input",
-                              seqInputPtr,
-                              seqStartPos,
-                              subSeqStartPos});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sub_nested_seq",
-                  /* batchSize */ seqNum,
-                  /* trans */ false,
-                  /* useGpu*/ useGpu,
-                  /* useWeight */ false);
-  }
-}
-
-TEST(Layer, ClipLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("clip");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ClipConfig* layerConf = input->mutable_clip_conf();
-  double p1 = std::rand() / (double)RAND_MAX;
-  double p2 = std::rand() / (double)RAND_MAX;
-  layerConf->set_min(std::min(p1, p2));
-  layerConf->set_max(std::max(p1, p2));
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, RowL2NormLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("row_l2_norm");
-  config.layerConfig.set_size(size);
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
-  }
-}
-
-void test3DConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  // Setting up conv3D-trans layer
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_img_size_z(IMAGE_SIZE_Z);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-  conv->set_output_z(outputSize(conv->img_size_z(),
-                                conv->filter_size_z(),
-                                conv->padding_z(),
-                                conv->stride_z(),
-                                /*  caffeMode */ true));
-
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              conv->output_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "conv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DConvLayer) {
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 4;
-  const int IMAGE_SIZE_Y = 6;
-  const int IMAGE_SIZE_Z = 6;
-
-  // Setting up conv-trans layer
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type("deconv3d");
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_output_x(IMAGE_SIZE);
-  conv->set_output_y(IMAGE_SIZE_Y);
-  conv->set_output_z(IMAGE_SIZE_Z);
-
-  conv->set_img_size(imageSize(conv->output_x(),
-                               conv->filter_size(),
-                               conv->padding(),
-                               conv->stride(),
-                               true));
-  conv->set_img_size_y(imageSize(conv->output_y(),
-                                 conv->filter_size_y(),
-                                 conv->padding_y(),
-                                 conv->stride_y(),
-                                 true));
-  conv->set_img_size_z(imageSize(conv->output_z(),
-                                 conv->filter_size_z(),
-                                 conv->padding_z(),
-                                 conv->stride_z(),
-                                 true));
-  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
-                              conv->img_size_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DDeConvLayer) {
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, ScaleShiftLayer) {
-  // FIXME: Disable ScaleShiftLayer because it is not stable.
-  // https://github.com/PaddlePaddle/Paddle/issues/7781
-  return;
-  //  const size_t batchSize = 16;
-  //  const size_t size = 32;
-  //  TestConfig config;
-  //  config.layerConfig.set_type("scale_shift");
-  //  config.layerConfig.set_size(size);
-  //  config.biasSize = 1;
-  //  config.inputDefs.push_back(
-  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-  //  config.layerConfig.add_inputs();
-  //  for (auto useGpu : {false, true}) {
-  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-  //  }
-}
-
-TEST(Layer, ScaleSubRegionLayer) {
-  const size_t batchSize = 64;
-  const size_t size = 4096;
-  TestConfig config;
-  config.layerConfig.set_type("scale_sub_region");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
-  auto* data = indicesV->getData();
-  for (size_t i = 0; i < batchSize; ++i) {
-    data[i * 2] = 2;
-    data[i * 2 + 1] = 4;
-    data[i * 2 + 2] = 16;
-    data[i * 2 + 3] = 32;
-    data[i * 2 + 4] = 16;
-    data[i * 2 + 5] = 32;
-  }
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ScaleSubRegionConfig* scaleSubRegionConf =
-      input->mutable_scale_sub_region_conf();
-  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
-  imgConf->set_img_size(32);
-  imgConf->set_img_size_y(32);
-  imgConf->set_channels(4);
-  scaleSubRegionConf->set_value(2.0);
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, L2DistanceLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("l2_distance");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  const size_t input_dim = 27;
-  const size_t batch_size = 11;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
-  }
-}
-
-void testFactorizationMachineLayer(InputType type, bool useGpu) {
-  const int FACTOR_SIZE = 10;
-  TestConfig config;
-  config.layerConfig.set_type("factorization_machine");
-  config.layerConfig.set_factor_size(FACTOR_SIZE);
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-  config.inputDefs.push_back({type, "layer_0", 128, 1280});
-  config.layerConfig.add_inputs();
-  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
-}
-
-TEST(Layer, FactorizationMachineLayer) {
-  for (auto useGpu : {false, true}) {
-    testFactorizationMachineLayer(INPUT_DATA, useGpu);
-  }
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
deleted file mode 100644
index 7082c1363..000000000
--- a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline bool getNextSequence(vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-TEST(LinearChainCRF, decoding) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-      vector<int> decodingResult(length);
-      vector<int> bestResult(length);
-      vector<int> testResult(length, 0);
-      crf.decode(x.getData(), &decodingResult[0], length);
-      real bestScore = -std::numeric_limits<real>::max();
-      do {
-        real score = a[testResult.front()] + b[testResult.back()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        if (score > bestScore) {
-          bestScore = score;
-          bestResult = testResult;
-        }
-      } while (getNextSequence(testResult, numClasses));
-      for (int k = 0; k < length; ++k) {
-        EXPECT_EQ(decodingResult[k], bestResult[k]);
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/gserver/tests/test_MKLDNN.cpp b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
deleted file mode 100644
index c79ccd195..000000000
--- a/paddle/legacy/gserver/tests/test_MKLDNN.cpp
+++ /dev/null
@@ -1,448 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <string>
-#include <vector>
-#include "MKLDNNTester.h"
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/activations/MKLDNNActivation.h"
-#include "paddle/legacy/math/MathUtils.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(use_gpu);
-DECLARE_bool(use_mkldnn);
-
-#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)         \
-  MKLDNNTester tester;                                        \
-  for (auto bs : {DESC.bs, 1}) {                              \
-    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \
-  }
-
-#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
-  TestConfig ref = DNN_CONFIG;                            \
-  ref.layerConfig.set_type(REF_TYPE);                     \
-  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
-
-struct testFcDesc {
-  int bs;
-  int ic;
-  int ih, iw;  // oh == ow == 1
-  int oc;
-};
-
-static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_fc");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.oc);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
-  cfg.layerConfig.add_inputs();
-}
-
-void testFcLayer(const testFcDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNFcConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
-  }
-}
-
-TEST(MKLDNNLayer, FcLayer) {
-  /* bs, ic, ih, iw, oc */
-  testFcLayer({2, 2, 1, 1, 3});
-  testFcLayer({3, 7, 1, 1, 19});
-  testFcLayer({8, 16, 13, 13, 32});
-  testFcLayer({4, 12, 13, 13, 18});
-  testFcLayer({2, 64, 16, 16, 32});
-  testFcLayer({15, 3, 16, 16, 6});
-}
-
-struct testConvDesc {
-  int bs, gp;
-  int ic, ih, iw;
-  int oc, oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-  int dh, dw;
-};
-
-static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_conv");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_num_filters(pm.oc);
-  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
-  cfg.layerConfig.set_shared_biases(true);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_groups(pm.gp);
-  conv->set_img_size(pm.iw);
-  conv->set_img_size_y(pm.ih);
-  conv->set_output_x(pm.ow);
-  conv->set_output_y(pm.oh);
-  conv->set_filter_size(pm.fw);
-  conv->set_filter_size_y(pm.fh);
-  conv->set_channels(pm.ic);
-  conv->set_padding(pm.pw);
-  conv->set_padding_y(pm.ph);
-  conv->set_stride(pm.sw);
-  conv->set_stride_y(pm.sh);
-  conv->set_dilation(pm.dw);
-  conv->set_dilation_y(pm.dh);
-  conv->set_caffe_mode(true);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
-      << "it is indivisible";
-
-  int fh = (pm.fh - 1) * pm.dh + 1;
-  int fw = (pm.fw - 1) * pm.dw + 1;
-  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
-  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testConvLayer(const testConvDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNConvConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
-  }
-}
-
-TEST(MKLDNNLayer, ConvLayer) {
-  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
-  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
-  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
-  // with groups
-  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
-}
-
-struct testPoolDesc {
-  int bs, ic;  // input channel and output channel are the same
-  int ih, iw;
-  int oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-};
-
-static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_pool");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-  pool->set_pool_type("avg-projection");
-  pool->set_channels(pm.ic);
-  pool->set_img_size(pm.iw);
-  pool->set_img_size_y(pm.ih);
-  pool->set_output_x(pm.ow);
-  pool->set_output_y(pm.oh);
-  pool->set_size_x(pm.fw);
-  pool->set_size_y(pm.fh);
-  pool->set_padding(pm.pw);
-  pool->set_padding_y(pm.ph);
-  pool->set_stride(pm.sw);
-  pool->set_stride_y(pm.sh);
-
-  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
-  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testPoolLayer(const testPoolDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNPoolConfig(dnnConfig, pm);
-  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
-  PoolConfig* pool = input->mutable_pool_conf();
-  for (auto type : {"max-projection", "avg-projection"}) {
-    pool->set_pool_type(type);
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
-  }
-}
-
-TEST(MKLDNNLayer, PoolLayer) {
-  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
-  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
-  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
-  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
-}
-
-struct testBatchNormDesc {
-  int bs;
-  int ic;
-  int ih, iw;
-};
-
-static void getMKLDNNBatchNormConfig(TestConfig& cfg,
-                                     const testBatchNormDesc& pm) {
-  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
-  cfg.layerConfig.set_type("mkldnn_batch_norm");
-  cfg.biasSize = pm.ic;
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.ic)});
-  cfg.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.add_inputs();
-  cfg.layerConfig.add_inputs();
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(pm.ic);
-  img_conf->set_img_size_y(pm.ih);
-  img_conf->set_img_size(pm.iw);
-}
-
-void testBatchNormLayer(const testBatchNormDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNBatchNormConfig(dnnConfig, pm);
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("batch_norm");
-  // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1
-  VLOG(MKLDNN_TESTS) << "check train phase";
-  dnnConfig.layerConfig.set_use_global_stats(false);
-  refConfig.layerConfig.set_use_global_stats(false);
-  MKLDNNTester tester;
-  tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
-  // for PASS_TEST, check use_global_stats true and false, and batchsize 1
-  VLOG(MKLDNN_TESTS) << "check test phase";
-  for (auto useGS : {false, true}) {
-    dnnConfig.layerConfig.set_use_global_stats(useGS);
-    refConfig.layerConfig.set_use_global_stats(useGS);
-    MKLDNNTester tester;
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST);
-    }
-  }
-}
-
-TEST(MKLDNNLayer, BatchNormLayer) {
-  testBatchNormLayer({4, 10, 6, 6});
-  testBatchNormLayer({16, 32, 16, 16});
-  testBatchNormLayer({4, 16, 8, 10});
-}
-
-struct testLRNDesc {
-  int bs, ic, ih, iw;
-  float scale, pow;
-  int localSize;
-};
-
-void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_lrn");
-  cfg.layerConfig.set_active_type("relu");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_channels(pm.ic);
-  norm->set_size(pm.localSize);
-  norm->set_scale(pm.scale);
-  norm->set_pow(pm.pow);
-  norm->set_blocked(0);
-  norm->set_img_size(pm.iw);
-  norm->set_img_size_y(pm.ih);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  cfg.layerConfig.set_size(layerSize);
-  cfg.biasSize = 0;
-}
-
-void testLRNLayer(const testLRNDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNLRNConfig(dnnConfig, pm);
-  // mkldnn_lrn <==> norm with cmrnorm-projection type
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("norm");
-  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cmrnorm-projection");
-  norm->set_scale(norm->scale() / norm->size());
-  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
-}
-
-TEST(MKLDNNLayer, LRNLayer) {
-  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
-  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
-  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
-}
-
-struct testImageDesc {
-  int bs, ic, ih, iw;
-};
-
-static void getAddtoConfig(TestConfig& cfg,
-                           const testImageDesc& pm,
-                           const size_t nInputs = 1) {
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.layerConfig.set_size(layerSize);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < nInputs; ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(pm.ic);
-    img_conf->set_img_size_y(pm.ih);
-    img_conf->set_img_size(pm.iw);
-  }
-}
-
-void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
-  CHECK_GE(nInputs, 1UL);
-  TestConfig dnnConfig;
-  getAddtoConfig(dnnConfig, pm, nInputs);
-  dnnConfig.layerConfig.set_type("mkldnn_addto");
-  for (auto withBias : {false, true}) {
-    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
-  }
-}
-
-TEST(MKLDNNLayer, AddtoLayer) {
-  testAddtoLayer({16, 5, 14, 14}, 1);
-  testAddtoLayer({8, 10, 8, 8}, 2);
-  testAddtoLayer({4, 12, 1, 1}, 3);
-}
-
-static void getMKLDNNConcatConfig(TestConfig& cfg,
-                                  const std::vector<testImageDesc>& inputs) {
-  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
-  int oc = inputs[0].ic;
-  for (size_t i = 1; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i].bs, inputs[0].bs);
-    CHECK_EQ(inputs[i].ih, inputs[0].ih);
-    CHECK_EQ(inputs[i].iw, inputs[0].iw);
-    oc += inputs[i].ic;
-  }
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("mkldnn_concat");
-  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back(
-        {INPUT_DATA,
-         ss.str(),
-         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
-         0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(inputs[i].ic);
-    img_conf->set_img_size_y(inputs[i].ih);
-    img_conf->set_img_size(inputs[i].iw);
-  }
-}
-
-void testConcatLayer(const std::vector<testImageDesc>& inputs) {
-  TestConfig dnnConfig;
-  getMKLDNNConcatConfig(dnnConfig, inputs);
-  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
-}
-
-TEST(MKLDNNLayer, ConcatLayer) {
-  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
-  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
-}
-
-void testActivation(std::string actType, const testImageDesc& pm) {
-  // TODO(TJ): remove me when paddle support elu activation
-  if (actType == "mkldnn_elu") {
-    return;
-  }
-  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
-  TestConfig cfg;
-  getAddtoConfig(cfg, pm);
-  TestConfig ref = cfg;
-  cfg.layerConfig.set_active_type(compareTypes[0]);
-  ref.layerConfig.set_active_type(compareTypes[1]);
-  RUN_MKLDNN_TEST(cfg, ref, pm)
-}
-
-TEST(MKLDNNActivation, Activations) {
-  auto types = MKLDNNActivation::getAllRegisteredTypes();
-  for (auto type : types) {
-    /* bs, c, h, w*/
-    testActivation(type, {16, 64, 32, 32});
-    testActivation(type, {2, 8, 1, 1});
-  }
-}
-
-DECLARE_string(config_args);
-TEST(MKLDNNNet, net) {
-  std::vector<std::string> cases = {"simple", "branch"};
-  for (auto name : cases) {
-    std::string config = "./legacy/gserver/tests/mkldnn_" + name + "_net.conf";
-    for (auto channels : {2, 32}) {
-      std::ostringstream oss;
-      oss << "channels=" << channels;
-      FLAGS_config_args = oss.str();
-      MKLDNNTester::runNetTest(config);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = true;
-  initMain(argc, argv);
-  initPython(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
deleted file mode 100644
index 2bc261b4a..000000000
--- a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(1);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(1);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
-                                       const string& poolType,
-                                       bool use_gpu,
-                                       MatrixPtr& maskMat) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(5);
-  pool->set_img_size_y(5);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  config.layerConfig.set_name("MaxPoolWithMask");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-
-  initDataLayer(config,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "MaxPoolWithMask",
-                1,
-                false,
-                use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
-
-  FLAGS_use_gpu = use_gpu;
-  std::vector<ParameterPtr> parameters;
-  LayerPtr maxPoolingWithMaskOutputLayer;
-  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
-  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
-
-  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
-                   maskMat);
-}
-
-TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
-  bool useGpu = false;
-  MatrixPtr inputMat;
-  MatrixPtr maskMat;
-  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
-                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
-                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
-  real maskData[] = {12, 4, 22, 24};
-
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->setData(inputData);
-  maskMat->setData(maskData);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#ifdef PADDLE_WITH_CUDA
-  useGpu = true;
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->copyFrom(inputData, 25);
-  maskMat->copyFrom(maskData, 4);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#endif
-}
diff --git a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
deleted file mode 100644
index 25b1a1191..000000000
--- a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include <gtest/gtest.h>
-#include <vector>
-
-#undef PADDLE_DISABLE_TIMER
-#include "paddle/legacy/utils/Stat.h"
-
-#include "paddle/legacy/gserver/layers/MultinomialSampler.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-class MultinomialSamplerTester : public MultinomialSampler {
- public:
-  MultinomialSamplerTester(real* prob, int size)
-      : MultinomialSampler(prob, size) {}
-
-  template <typename Rand1>
-  int testGen(Rand1 rand1) {
-    return gen1(rand1);
-  }
-};
-
-TEST(MultinomialSampler, gen) {
-  int numGrids = 1024 * 1024;
-  int size = 1024 * 4;
-  default_random_engine reng;
-
-  for (size_t iter = 0; iter < 256; ++iter) {
-    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
-    vector<real> prob;
-    int sum = 0;
-    for (int i = 0; i < size; ++i) {
-      prob.push_back(rand(reng));
-      sum += prob.back();
-    }
-
-    CHECK_LE(sum, numGrids);
-    prob.back() += numGrids - sum;
-
-    vector<int> counts(size);
-    MultinomialSamplerTester sampler(&prob[0], size);
-    counts.assign(size, 0);
-    {
-      double s = (double)size / (double)numGrids;
-      REGISTER_TIMER("MultinomialSampler");
-      for (double i = 0; i < numGrids; ++i) {
-        int ret = sampler.testGen([i, s]() { return s * i; });
-        if (ret < 0 || ret >= size) {
-          EXPECT_GE(ret, 0);
-          EXPECT_LT(ret, size);
-          break;
-        }
-        ++counts[ret];
-      }
-    }
-    for (int i = 0; i < size; ++i) {
-      if (prob[i] != counts[i]) {
-        EXPECT_EQ(prob[i], counts[i]);
-        LOG(INFO) << iter;
-        break;
-      }
-    }
-  }
-}
-
-void benchmarkRandom() {
-  int n = 1024 * 1024;
-
-  int sum;
-  double sum1;
-
-  sum = 0;
-  unsigned int seed = 1;
-  {
-    REGISTER_TIMER("crand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand_r(&seed) % 1000;
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  default_random_engine reng;
-  uniform_int_distribution<int> rand(1, 1000);
-  sum = 0;
-  {
-    REGISTER_TIMER("stdrand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand(reng);
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  sum = 0;
-  {
-    REGISTER_TIMER("default_random_engine");
-    for (int i = 0; i < n; ++i) {
-      sum += reng();
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  uniform_real_distribution<double> rand1(0, 1);
-  sum1 = 0;
-  {
-    REGISTER_TIMER("stdrand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += rand1(reng);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-
-  sum1 = 0;
-  {
-    real a = 1.0f / (real)RAND_MAX;
-    REGISTER_TIMER("crand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += a * rand_r(&seed);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  benchmarkRandom();
-  int ret = RUN_ALL_TESTS();
-  globalStat.printSegTimerStatus();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
deleted file mode 100644
index c9f9f3e61..000000000
--- a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DEFINE_bool(use_label, true, "input label or sequence label");
-DEFINE_bool(static_para, false, "static parameter");
-
-struct DataIn {
-  std::vector<Argument> inArgs;
-  std::vector<MatrixPtr> outGrads;
-  std::vector<VectorPtr> paraValues;
-};
-
-struct DataOut {
-  std::vector<MatrixPtr> outValues;
-  std::vector<VectorPtr> paraGrads;
-};
-
-void initArgument(DataIn& data,
-                  const std::string& configPath,
-                  bool useGpu = FLAGS_use_gpu) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    Argument arg;
-    arg.value = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.value->randomizeUniform();
-    arg.value->add(-0.5);
-    arg.value->sigmoid(*arg.value);
-    arg.grad->zeroMem();
-    if (FLAGS_use_label) {
-      arg.ids = VectorT<int>::create(batchSize, useGpu);
-      arg.ids->rand(layerSize);
-    }
-    generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-    data.inArgs.push_back(arg);
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    grad->randomizeUniform();
-    data.outGrads.push_back(grad);
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), useGpu);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-
-  std::vector<ParameterPtr> parameters;
-  vector<Argument> outArgs;
-
-  auto gradientMachine = trainer.getGradientMachine();
-  parameters = gradientMachine->getParameters();
-  if (FLAGS_static_para) {
-    for (size_t i = 0; i < parameters.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->one();
-    }
-  } else {
-    for (size_t i = 0; i < in.paraValues.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-    }
-  }
-  gradientMachine->start();
-  gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    // If the all the layers in the config have no parameters, also
-    // not set NeedGradient(), the outArgs[i] will be nullptr.
-    outArgs[i].grad->copyFrom(*in.outGrads[i]);
-  }
-  gradientMachine->backward();
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(),
-                                     outArgs[i].value->getWidth(),
-                                     false,
-                                     false);
-    value->copyFrom(*outArgs[i].value);
-    out.outValues.push_back(value);
-  }
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr grad = Vector::create(
-        parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false);
-    grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT));
-    out.paraGrads.push_back(grad);
-  }
-
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("forward");
-    gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  }
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("backward");
-    gradientMachine->backward();
-  }
-
-  gradientMachine->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-}
-
-void compareGradient(DataOut& outA, DataOut& outB) {
-  LOG(INFO) << "------------------------------"
-            << " Check Network Output "
-            << "------------------------------";
-  for (size_t i = 0; i < outA.outValues.size(); ++i) {
-    LOG(INFO) << "OUTPUT VALUE: " << i;
-    checkBuffer(outA.outValues[i]->getData(),
-                "network A output",
-                outB.outValues[i]->getData(),
-                "network B output",
-                outA.outValues[i]->getElementCnt(),
-                outA.outValues[i]->getWidth());
-  }
-
-  if (!FLAGS_static_para) {
-    LOG(INFO) << "------------------------------"
-              << " Check Parameters "
-              << "------------------------------";
-    for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
-      LOG(INFO) << "PARAMETER GRADIENT: " << i;
-      checkBuffer(outA.paraGrads[i]->getData(),
-                  "Network A",
-                  outB.paraGrads[i]->getData(),
-                  "Network B",
-                  outA.paraGrads[i]->getSize());
-    }
-  }
-}
-
-void compareNetwork(const std::string& config_file_a,
-                    const std::string& config_file_b) {
-  DataIn in;
-  initArgument(in, config_file_a);
-
-  DataOut dataA;
-  calcGradient(in, dataA, config_file_a);
-  LOG(INFO) << "forwardBackward of Network A is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  DataOut dataB;
-  calcGradient(in, dataB, config_file_b);
-  LOG(INFO) << "forwardBackward of the Network B is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-TEST(Compare, concat_dotmul) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_dotmul_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_dotmul_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_fullmatrix) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_fullmatrix_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_fullmatrix_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_table) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_table_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_table_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_slice) {
-  std::string config_file_a = "./legacy/gserver/tests/concat_slice_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/concat_slice_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Compare, img_pool) {
-  std::string config_file_a = "./legacy/gserver/tests/img_pool_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/img_pool_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-TEST(Compare, img_conv) {
-  std::string config_file_a = "./legacy/gserver/tests/img_conv_a.conf";
-  std::string config_file_b = "./legacy/gserver/tests/img_conv_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-// Test cudnn_conv and exconv give the same result
-TEST(Compare, img_conv2) {
-  std::string config_file_a = "./legacy/gserver/tests/img_conv_cudnn.py";
-  std::string config_file_b = "./legacy/gserver/tests/img_conv_exconv.py";
-  bool useGpu = FLAGS_use_gpu;
-  double eps = FLAGS_checkgrad_eps;
-  FLAGS_use_gpu = true;
-  // Sometimes, this unit test will fail with 1e-2
-  FLAGS_checkgrad_eps = 4e-2;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-  FLAGS_checkgrad_eps = eps;
-}
-#endif
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-TEST(Compare, network) {
-  if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
-    compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_PriorBox.cpp b/paddle/legacy/gserver/tests/test_PriorBox.cpp
deleted file mode 100644
index 10d512ec4..000000000
--- a/paddle/legacy/gserver/tests/test_PriorBox.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-// Do one forward pass of priorBox layer and check to see if its output
-// matches the given result
-void doOnePriorBoxTest(size_t feature_map_width,
-                       size_t feature_map_height,
-                       size_t image_width,
-                       size_t image_height,
-                       vector<int> min_size,
-                       vector<int> max_size,
-                       vector<real> aspect_ratio,
-                       vector<real> variance,
-                       bool use_gpu,
-                       MatrixPtr& result) {
-  // Setting up the priorbox layer
-  TestConfig configt;
-  configt.layerConfig.set_type("priorbox");
-
-  configt.inputDefs.push_back({INPUT_DATA, "featureMap", 1, 0});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  configt.inputDefs.push_back({INPUT_DATA, "image", 1, 0});
-  configt.layerConfig.add_inputs();
-  PriorBoxConfig* pb = input->mutable_priorbox_conf();
-  for (size_t i = 0; i < min_size.size(); i++) pb->add_min_size(min_size[i]);
-  for (size_t i = 0; i < max_size.size(); i++) pb->add_max_size(max_size[i]);
-  for (size_t i = 0; i < variance.size(); i++) pb->add_variance(variance[i]);
-  for (size_t i = 0; i < aspect_ratio.size(); i++)
-    pb->add_aspect_ratio(aspect_ratio[i]);
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
-  dataLayers[0]->getOutput().setFrameHeight(feature_map_height);
-  dataLayers[0]->getOutput().setFrameWidth(feature_map_width);
-  dataLayers[1]->getOutput().setFrameHeight(image_height);
-  dataLayers[1]->getOutput().setFrameWidth(image_width);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr priorboxLayer;
-  initTestLayer(configt, &layerMap, &parameters, &priorboxLayer);
-  priorboxLayer->forward(PASS_GC);
-  checkMatrixEqual(priorboxLayer->getOutputValue(), result);
-}
-
-TEST(Layer, priorBoxLayerFwd) {
-  vector<int> minSize;
-  vector<int> maxSize;
-  vector<real> aspectRatio;
-  vector<real> variance;
-  bool useGpu = false;
-
-  minSize.push_back(276);
-  maxSize.push_back(330);
-  variance.push_back(0.1);
-  variance.push_back(0.1);
-  variance.push_back(0.2);
-  variance.push_back(0.2);
-
-  // CPU case 1.
-  MatrixPtr result;
-  real resultData[] = {0.04,
-                       0.04,
-                       0.96,
-                       0.96,
-                       0.1,
-                       0.1,
-                       0.2,
-                       0.2,
-                       0,
-                       0,
-                       1,
-                       1,
-                       0.1,
-                       0.1,
-                       0.2,
-                       0.2};
-  result = Matrix::create(1, 2 * 8, false, useGpu);
-  result->setData(resultData);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-  // CPU case 2.
-  variance[1] = 0.2;
-  variance[3] = 0.1;
-  maxSize.pop_back();
-  real resultData2[] = {0,     0,     0.595, 0.595, 0.1, 0.2, 0.2, 0.1,
-                        0.405, 0,     1,     0.595, 0.1, 0.2, 0.2, 0.1,
-                        0,     0.405, 0.595, 1,     0.1, 0.2, 0.2, 0.1,
-                        0.405, 0.405, 1,     1,     0.1, 0.2, 0.2, 0.1};
-  Matrix::resizeOrCreate(result, 1, 4 * 8, false, useGpu);
-  result->setData(resultData2);
-  doOnePriorBoxTest(/* feature_map_width */ 2,
-                    /* feature_map_height */ 2,
-                    /* image_width */ 400,
-                    /* image_height */ 400,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-  // CPU case 3.
-  aspectRatio.push_back(2);
-  real resultData3[] = {0.04,     0.04, 0.96, 0.96,       0.1,        0.2,
-                        0.2,      0.1,  0,    0.17473088, 1,          0.825269,
-                        0.1,      0.2,  0.2,  0.1,        0.17473088, 0,
-                        0.825269, 1,    0.1,  0.2,        0.2,        0.1};
-  Matrix::resizeOrCreate(result, 1, 3 * 8, false, useGpu);
-  result->setData(resultData3);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    result);
-
-#ifdef PADDLE_WITH_CUDA
-  // reset the input parameters
-  variance[1] = 0.1;
-  variance[3] = 0.2;
-  maxSize.push_back(330);
-  aspectRatio.pop_back();
-  MatrixPtr resultGpu;
-  useGpu = true;
-  // GPU case 1.
-  resultGpu = Matrix::create(1, 2 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData, 2 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-  // GPU case 2.
-  variance[1] = 0.2;
-  variance[3] = 0.1;
-  maxSize.pop_back();
-  Matrix::resizeOrCreate(resultGpu, 1, 4 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData2, 4 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 2,
-                    /* feature_map_height */ 2,
-                    /* image_width */ 400,
-                    /* image_height */ 400,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-  // GPU case 3.
-  aspectRatio.push_back(2);
-  Matrix::resizeOrCreate(resultGpu, 1, 3 * 8, false, useGpu);
-  resultGpu->copyFrom(resultData3, 3 * 8);
-  doOnePriorBoxTest(/* feature_map_width */ 1,
-                    /* feature_map_height */ 1,
-                    /* image_width */ 300,
-                    /* image_height */ 300,
-                    minSize,
-                    maxSize,
-                    aspectRatio,
-                    variance,
-                    useGpu,
-                    resultGpu);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
deleted file mode 100644
index 0209e6818..000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/gserver/dataproviders/PyDataProvider.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;     // NOLINT
-using namespace paddle;  // NOLINT
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu);
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num);
-
-TEST(PyDataProvider, py_fill_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleDataProvider"));
-  config.clear_files();
-  std::string dataFile =
-      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 2UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 2);
-}
-
-TEST(PyDataProvider, py_fill_nest_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleNestDataProvider"));
-  config.clear_files();
-  std::string dataFile =
-      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-  EXPECT_EQ(config.IsInitialized(), true);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 4UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 4);
-  // Check subSequenceStartPositions
-  EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL);
-  EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0);
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2);
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4);
-    } else {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4);
-    }
-  }
-}
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu) {
-  // Dense
-  real* data;
-  if (useGpu) {
-    MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(),
-                                            argumentList[0].value->getWidth(),
-                                            0,
-                                            0);
-    cpuMatrixPtr->copyFrom(*argumentList[0].value);
-    data = cpuMatrixPtr->getData();
-  } else {
-    data = argumentList[0].value->getData();
-  }
-  for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) {
-    EXPECT_EQ(*(data + i), (float)(i % 3 + 1));
-  }
-  // Sparse without value
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    data = argumentList[0].value->getData();
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) {
-    size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i);
-    EXPECT_EQ(colNum, (size_t)2);
-    const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i);
-    for (size_t j = 0; j < colNum; ++j) {
-      EXPECT_EQ((size_t)buf[j], (size_t)(j + 1));
-    }
-  }
-  // Index
-  for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) {
-    EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL);
-  }
-}
-
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num) {
-  EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL);
-  EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1);
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2),
-                sample_num);
-    } else {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1),
-                sample_num);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
deleted file mode 100644
index de313ba82..000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(train_list, "unittest.list", "file list for unittest");
-
-namespace paddle {
-namespace unittest {
-namespace pydp2 {
-extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
-extern void clearOnPoolFilledHook();
-
-}  // namespace pydp2
-}  // namespace unittest
-}  // namespace paddle
-
-const paddle::real epsilon = 1e-5;
-
-static inline int64_t readDataBatch(paddle::DataBatch *batch,
-                                    const std::string &funcName,
-                                    int64_t batchSize = 65535) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object(funcName);
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  return provider->getNextBatchInternal(batchSize, batch);
-}
-
-TEST(PyDataProvider2, dense_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      size_t ii = i + 100;
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_EQ(num, 0);
-  }
-}
-
-TEST(PyDataProvider2, index_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_index_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(10000, &batch);
-    CHECK_EQ(num, 200);
-    for (int i = 0; i < 200; ++i) {
-      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
-    }
-  }
-}
-
-TEST(PyDataProvider2, init_hook) {
-  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
-  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
-  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
-  paddle::PyObjectPtr locals(PyDict_New());
-  paddle::PyObjectPtr mdl(PyRun_String(
-      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
-      Py_file_input,
-      globals.get(),
-      locals.get()));
-  CHECK_PY(mdl) << "Error!";
-  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
-  CHECK_PY(dps) << "Error!";
-
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_init_hook");
-  config.set_load_data_args(PyString_AsString(dps.get()));
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(100000, &batch);
-  ASSERT_EQ(num, 200);
-  auto &mat = batch.getStreams()[0].value;
-  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < 20; ++j) {
-      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_no_value_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_sparse_non_value_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(10000, &batch);
-  CHECK_EQ(num, 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    for (int j = 0; j < 10; ++j) {
-      CHECK_EQ(cols[j], (i + 1) * (j + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_value_no_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    real *dat = csm->getRowValues(i);
-    for (int j = 0; j < 10; ++j) {
-      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
-      EXPECT_EQ(dat[j], real(j) / real(i + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, index_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
-    for (size_t j = 0; j < i + 1; ++j) {
-      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
-      ++tmp;
-    }
-  }
-  ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201);
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    tmp += i;
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp);
-  }
-  tmp += 200;
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp);
-}
-
-TEST(PyDataProvider2, index_sub_seq) {
-  paddle::DataBatch batch;
-  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      for (size_t k = 0; k < j + 1; ++k) {
-        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
-      }
-    }
-  }
-
-  CHECK_EQ(tmp, arg.ids->getSize());
-
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201);
-  ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0);
-  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
-  size_t idx = 1;
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      tmp += j + 1;
-      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
-                (size_t)tmp);
-      ++idx;
-    }
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
-  }
-}
-
-TEST(PyDataProvider2, min_pool_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size");
-  config.set_load_data_args("");
-  size_t totalData = 1 << 14;
-  constexpr size_t batchSize = 100;
-  constexpr size_t minPoolSize = 1000;
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-
-  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
-    if (totalData > batchSize) {
-      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
-    }
-  });
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      totalData -= realBatchSize;
-    } else {
-      break;
-    }
-  }
-  paddle::unittest::pydp2::clearOnPoolFilledHook();
-}
-
-TEST(PyDataProvider2, can_over_batch_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_can_over_batch_size");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
-    } else {
-      break;
-    }
-  }
-}
-
-TEST(PyDataProvider2, input_order) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_input_order");
-  config.set_load_data_args("");
-
-  paddle::ModelConfig modelConfig;
-  *modelConfig.add_input_layer_names() = "input1";
-  *modelConfig.add_input_layer_names() = "input2";
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, modelConfig, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (!realBatchSize) {
-      break;
-    }
-    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
-    for (int64_t i = 0; i < realBatchSize; ++i) {
-      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
-      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
-    }
-  }
-}
-
-TEST(PyDataProvider2, test_check) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_check");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
-    if (!realBatchSize) {
-      break;
-    } else {
-      auto &ivec = batch.getStream(0).ids;
-      for (size_t i = 0; i < ivec->getSize(); ++i) {
-        CHECK_LT(ivec->getData()[i], 10);
-      }
-    }
-  }
-}
-
-TEST(PyDataProvider2, multiThread) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  paddle::DataBatch batch;
-  provider->getNextBatch(100, &batch);
-  provider->reset();
-  provider.reset();
-}
-
-TEST(PyDataProvider2, minPoolSizeWithCache) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size_with_cache");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  paddle::DataBatch batch;
-
-  for (int i = 0; i < 10; ++i) {
-    provider->reset();
-    int64_t sum = 0;
-    while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
-      sum += actualNum;
-    }
-    ASSERT_EQ(1 << 20, sum);
-  }
-}
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-
-  std::ofstream fout(FLAGS_train_list);
-  CHECK(fout.is_open());
-  fout << "stub file name" << std::endl;  // in unittest, filename is not used.
-  fout.close();
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.py b/paddle/legacy/gserver/tests/test_PyDataProvider2.py
deleted file mode 100644
index 461d80b9e..000000000
--- a/paddle/legacy/gserver/tests/test_PyDataProvider2.py
+++ /dev/null
@@ -1,125 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-import random
-
-from paddle.trainer.PyDataProvider2 import *
-
-
-@provider(slots=[dense_vector(200, seq_type=SequenceType.NO_SEQUENCE)])
-def test_dense_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [(float(j - 100) * float(i + 1)) / 200.0 for j in xrange(200)]
-
-
-@provider(input_types=[integer_value(200, seq_type=SequenceType.NO_SEQUENCE)])
-def test_index_no_seq(setting, filename):
-    for i in xrange(200):
-        yield i
-
-
-def test_init_hooker(setting, value, **kwargs):
-    setting.value = value
-
-
-@provider(
-    input_types=[dense_vector(
-        20, seq_type=SequenceType.NO_SEQUENCE)],
-    init_hook=test_init_hooker)
-def test_init_hook(setting, filename):
-    for i in xrange(200):
-        yield setting.value
-
-
-@provider(input_types=[
-    sparse_binary_vector(
-        30000, seq_type=SequenceType.NO_SEQUENCE)
-])
-def test_sparse_non_value_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [(i + 1) * (j + 1) for j in xrange(10)]
-
-
-@provider(input_types=[
-    sparse_float_vector(
-        30000, seq_type=SequenceType.NO_SEQUENCE)
-])
-def test_sparse_value_no_seq(setting, filename):
-    for i in xrange(200):
-        yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
-
-
-@provider(input_types=[integer_value(200, seq_type=SequenceType.SEQUENCE)])
-def test_index_seq(setting, filename):
-    for i in xrange(200):
-        yield range(i + 1)
-
-
-@provider(input_types=[index_slot(200, seq_type=SequenceType.SUB_SEQUENCE)])
-def test_index_sub_seq(setting, filename):
-    def gen_sub_seq(l):
-        l += 1
-        for j in xrange(l):
-            yield range(j + 1)
-
-    for i in xrange(200):
-        yield list(gen_sub_seq(i))
-
-
-@provider(input_types=[index_slot(100)], min_pool_size=1000)
-def test_min_pool_size(setting, filename):
-    for _ in xrange(1 << 14):
-        yield random.randint(0, 100 - 1)
-
-
-@provider(
-    input_types=[index_slot(
-        100, seq_type=SequenceType.SEQUENCE)],
-    can_over_batch_size=False,
-    calc_batch_size=lambda x: len(x[0]))
-def test_can_over_batch_size(setting, filename):
-    for _ in xrange(1 << 10):
-        seq_len = random.randint(0, 99)
-        yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
-
-
-@provider(input_types={'input1': index_slot(10), 'input2': index_slot(10)})
-def test_input_order(setting, filename):
-    for _ in xrange(1000):
-        yield {'input1': 0, 'input2': 1}
-
-
-@provider(
-    input_types=[index_slot(10)],
-    check=True,
-    check_fail_continue=True,
-    should_shuffle="123")  # also test should shuffle
-def test_check(settings, filename):
-    yield_good_value = False
-
-    while not yield_good_value:
-        for _ in xrange(10000):
-            i = random.randint(0, 100)
-            if i < 10:
-                yield_good_value = True
-            yield i
-
-
-@provider(
-    input_types=[index_slot(10)],
-    min_pool_size=1000,
-    cache=CacheType.CACHE_PASS_IN_MEM, )
-def test_min_pool_size_with_cache(settings, filename):
-    import random
-    for _ in xrange(2**20):
-        yield random.randint(0, 9)
diff --git a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
deleted file mode 100644
index 153c3e7f3..000000000
--- a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
-#include <paddle/legacy/trainer/Trainer.h>
-#include <paddle/legacy/trainer/TrainerInternal.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Util.h>
-#include <paddle/legacy/utils/Version.h>
-
-DECLARE_int32(seed);
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-class TrainerForTest : public paddle::Trainer {
- public:
-  void startTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.start();
-  }
-
-  void finishTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.finish();
-  }
-
-  /**
-   * Get total dimension of all parameters.
-   *
-   * @return the total dimension of all parameters
-   */
-  size_t getTotalParameterSize() const {
-    auto p = const_cast<TrainerForTest*>(this);
-    auto& params = p->getGradientMachine()->getParameters();
-    return std::accumulate(
-        params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) {
-          return a + p->getSize();
-        });
-  }
-};
-
-void CalCost(const string& conf,
-             const string& dir,
-             real* cost,
-             int num_passes) {
-  auto config = std::make_shared<TrainerConfigHelper>(conf);
-  TrainerForTest trainer;
-  trainer.init(config);
-  mkDir(dir.c_str());
-  config->setSaveDir(dir);
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = config->getOptConfig().batch_size();
-  real learningRate = config->getOptConfig().learning_rate();
-  real momentum = 0;
-  real decayRate = 0;
-  int64_t dim = trainer.getTotalParameterSize();
-  CpuVector vecW(dim);
-  CpuVector vecGradient(dim);
-  CpuVector vecMomentum(dim);
-
-  // vecW needs to be assigned, otherwise the variable is an uncertain value.
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  vecW.randnorm(0, 0.1);
-  vecMomentum.randnorm(0, 0.1);
-
-  trainer.startTrain();
-  for (int i = 0; i < num_passes; ++i) {
-    real totalCost = 0;
-    dataProvider->reset();
-    while (true) {
-      DataBatch dataBatch;
-      int num = dataProvider->getNextBatch(batchSize, &dataBatch);
-      if (num == 0) break;
-      totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient);
-      sgdUpdate(
-          learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum);
-    }
-    cost[i] = totalCost;
-  }
-  trainer.finishTrain();
-  rmDir(dir.c_str());
-}
-
-void test(const string& conf1, const string& conf2, double eps, bool useGpu) {
-  if (!paddle::version::isWithGpu() && useGpu) {
-    return;
-  }
-  FLAGS_use_gpu = useGpu;
-  int num_passes = 5;
-  real* cost1 = new real[num_passes];
-  const string dir1 = "legacy/gserver/tests/t1";
-  CalCost(conf1, dir1, cost1, num_passes);
-
-  real* cost2 = new real[num_passes];
-  const string dir2 = "legacy/gserver/tests/t2";
-  CalCost(conf2, dir2, cost2, num_passes);
-
-  for (int i = 0; i < num_passes; i++) {
-    LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i]
-              << ", cost2=" << cost2[i]
-              << ", diff=" << std::abs(cost1[i] - cost2[i]);
-    ASSERT_NEAR(cost1[i], cost2[i], eps);
-  }
-  delete[] cost1;
-  delete[] cost2;
-}
-
-TEST(RecurrentGradientMachine, HasSubSequence) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_layer_group.conf",
-         "legacy/gserver/tests/sequence_nest_layer_group.conf",
-         1e-5,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn.conf",
-         "legacy/gserver/tests/sequence_nest_rnn.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_multi_input.conf",
-         "legacy/gserver/tests/sequence_nest_rnn_multi_input.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
-         "legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_mixed_input) {
-  for (bool useGpu : {false, true}) {
-    test("legacy/gserver/tests/sequence_rnn_mixed_inputs.py",
-         "legacy/gserver/tests/sequence_rnn_matched_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-
-  if (paddle::version::isWithPyDataProvider()) {
-    if (!paddle::version::isWithGpu()) {
-      FLAGS_use_gpu = false;
-    }
-    initMain(argc, argv);
-    initPython(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
-  }
-}
diff --git a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
deleted file mode 100644
index 71198cb6a..000000000
--- a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
+++ /dev/null
@@ -1,571 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Version.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-DECLARE_bool(use_gpu);
-DECLARE_bool(rnn_use_batch);
-DECLARE_int32(fixed_seq_length);
-
-void checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkError(const CpuVector& vector1, const CpuVector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int size = vector1.getSize();
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  int count = 0;
-  for (int i = 0; i < size; i++) {
-    if (fabs(data1[i] - data2[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        int layerSize,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.value->sigmoid(*data.value);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-ParameterPtr creatParameter(string name,
-                            int pid,
-                            size_t paraSize,
-                            bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->enableType(PARAMETER_GRADIENT);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-ParameterPtr creatParameterBias(string name,
-                                int pid,
-                                size_t paraSize,
-                                bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-  paraConfig.set_initial_std(1);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ true);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-LayerPtr initRecurrentLayer(LayerConfig layerConfig,
-                            size_t batchSize,
-                            int layerSize,
-                            bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu);
-  layerMap[dataLayer->getName()] = dataLayer;
-
-  ParameterPtr para =
-      creatParameter("para_0", 0, layerSize * layerSize, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkRecurrentLayer(LayerPtr testLayer) {
-  const VectorPtr& weightGrad =
-      (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad();
-  CpuVector seqPara(weightGrad->getSize());
-  CpuVector batPara(weightGrad->getSize());
-  CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-
-  CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  outputGrad.randomizeUniform();
-
-  /* use sequence calculate */
-  FLAGS_rnn_use_batch = false;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  seqPara.copyFrom(*weightGrad);
-  seqInputGrad.copyFrom(*inputGrad);
-
-  /* use batch calculate */
-  FLAGS_rnn_use_batch = true;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  batPara.copyFrom(*weightGrad);
-  batInputGrad.copyFrom(*inputGrad);
-
-  /* check */
-  checkError(seqInputGrad, batInputGrad);
-  checkError(seqPara, batPara);
-}
-
-TEST(Layer, RecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_name("rnn");
-  layerConfig.set_type("recurrent");
-  layerConfig.set_active_type("tanh");
-  for (auto layerSize : {1, 10, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 20, 100, 128}) {
-      for (auto useGpu : {false, true}) {
-        for (auto reversed : {false, true}) {
-          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " useGpu=" << useGpu << " reversed=" << reversed;
-          layerConfig.set_size(layerSize);
-          layerConfig.set_reversed(reversed);
-          LayerPtr testLayer =
-              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
-          checkRecurrentLayer(testLayer);
-        }
-      }
-    }
-  }
-}
-
-#define protected public
-#include "paddle/legacy/gserver/layers/GatedRecurrentLayer.h"
-#include "paddle/legacy/gserver/layers/LstmLayer.h"
-#include "paddle/legacy/gserver/layers/RecurrentLayer.h"
-template <class T>
-class TestRecurrentLayer {
- public:
-  LayerConfig config_;
-  bool useGpu_;
-  bool useBatch_;
-  LayerPtr testLayer_;
-  LayerPtr dataLayer_;
-  ParameterPtr para_;
-  ParameterPtr bias_;
-  LayerMap layerMap_;
-  ParameterMap parameterMap_;
-  TestRecurrentLayer(const LayerConfig& config,
-                     bool useGpu,
-                     bool useBatch = false)
-      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
-  void init(size_t batchSize) {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_ = Layer::create(config_);
-    if (typeid(T) == typeid(GatedRecurrentLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 3,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 3,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
-    } else if (typeid(T) == typeid(LstmLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 4,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 4,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
-    }
-    layerMap_[dataLayer_->getName()] = dataLayer_;
-    parameterMap_[para_->getName()] = para_;
-    parameterMap_[bias_->getName()] = bias_;
-
-    layerMap_[testLayer_->getName()] = testLayer_;
-    testLayer_->init(layerMap_, parameterMap_);
-    testLayer_->setNeedGradient(true);
-    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
-  }
-  void forward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->forward(PASS_GC);
-  }
-  void backward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->backward(nullptr);
-  }
-};
-
-template <class T>
-void checkRecurrentLayer(LayerConfig layerConfig,
-                         size_t batchSize,
-                         bool cpuBatch,
-                         bool gpuBatch) {
-  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
-  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
-  testCpu.init(batchSize);
-  testGpu.init(batchSize);
-  auto checkError = [](
-      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
-    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
-    check.copyFrom(*gpu);
-    int height = cpu->getHeight();
-    int width = cpu->getWidth();
-    const real* data1 = cpu->getData();
-    const real* data2 = check.getData();
-    int count = 0;
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
-            1e-4) {
-          count++;
-        }
-      }
-    }
-    EXPECT_EQ(count, 0) << "[" << str << "]"
-                        << "There are " << count << " different element.";
-  };
-  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
-  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
-
-  Argument& cpuInput = testCpu.dataLayer_->getOutput();
-  Argument& gpuInput = testGpu.dataLayer_->getOutput();
-  gpuInput.resizeAndCopyFrom(cpuInput, true);
-
-  const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
-  gpuVec->copyFrom(*cpuVec);
-
-  const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE);
-  gpuBiasVec->copyFrom(*cpuBiasVec);
-
-  /* check forward */
-  testCpu.forward();
-  testGpu.forward();
-
-  checkError(
-      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
-
-  /* check backward */
-  cpuLayer->getOutputGrad()->randomizeUniform();
-  gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad());
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  testCpu.backward();
-  testGpu.backward();
-
-  // check input grad
-  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
-  // check weight grad
-  int numSequences = cpuInput.getNumSequences();
-  checkError(cpuLayer->weight_->getWGrad(),
-             gpuLayer->weight_->getWGrad(),
-             numSequences,
-             "weightGrad");
-  // check bias grad
-  checkError(cpuLayer->bias_->getWGrad(),
-             gpuLayer->bias_->getWGrad(),
-             numSequences,
-             "biasGrad");
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("gated_recurrent");
-  layerConfig.set_active_type("sigmoid");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<GatedRecurrentLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("lstmemory");
-  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("tanh");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<LstmLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef PADDLE_WITH_MKLML
-
-#include "paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h"
-
-LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
-                            bool reversed,
-                            int layerSize,
-                            LayerPtr dataLayer,
-                            ParameterPtr para,
-                            ParameterPtr bias = nullptr) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  parameterMap[para->getName()] = para;
-  if (bias) {
-    parameterMap[bias->getName()] = bias;
-    layerConfig.set_bias_parameter_name("bias_0");
-  }
-
-  layerConfig.set_size(layerSize);
-  layerConfig.set_reversed(reversed);
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkMKLPackedLayer(LayerConfig layerConfig1,
-                         LayerConfig layerConfig2,
-                         bool reversed,
-                         int layerSize,
-                         int batchSize,
-                         bool useBatch1,
-                         bool useBatch2) {
-  LayerPtr dataLayer;
-  ParameterPtr para, bias;
-
-  if (layerConfig1.type() == "recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize, false);
-    bias = nullptr;
-  } else if (layerConfig1.type() == "gated_recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
-    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
-  }
-
-  LayerPtr testLayer1 = initMKLPackedLayer(
-      layerConfig1, reversed, layerSize, dataLayer, para, bias);
-  LayerPtr testLayer2 = initMKLPackedLayer(
-      layerConfig2, reversed, layerSize, dataLayer, para, bias);
-
-  const VectorPtr& weightGrad =
-      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
-  CpuVector wgt_grad1(weightGrad->getSize());
-  CpuVector wgt_grad2(weightGrad->getSize());
-  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
-
-  for (int i = 0; i < 2; i++) {
-    FLAGS_rnn_use_batch = useBatch1;
-
-    testLayer1->forward(PASS_GC);
-
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->forward(PASS_GC);
-
-    testLayer1->getOutputGrad()->randomizeUniform();
-    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch1;
-    testLayer1->backward(nullptr);
-
-    wgt_grad1.copyFrom(*weightGrad);
-    input_grad1.copyFrom(*inputGrad);
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->backward(nullptr);
-
-    wgt_grad2.copyFrom(*weightGrad);
-    input_grad2.copyFrom(*inputGrad);
-
-    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
-    checkError(wgt_grad1, wgt_grad2);
-    checkError(input_grad1, input_grad2);
-  }
-}
-
-TEST(MKLPackedLayer, RecurrentLayer) {
-  LayerConfig layerConfig1;
-  LayerConfig layerConfig2;
-
-  layerConfig1.set_name("paddle-rnn");
-  layerConfig1.set_type("recurrent");
-  layerConfig1.set_active_type("relu");
-
-  layerConfig2.set_name("mkl-packed-rnn");
-  layerConfig2.set_type("mkl_packed_recurrent");
-  layerConfig2.set_active_type("relu");
-
-  FLAGS_use_gpu = false;
-
-  for (auto layerSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {true, false}) {
-        for (auto paddle_use_batch : {true, false}) {
-          for (auto MKLPacked_use_batch : {true, false}) {
-            LOG(INFO) << " layerSize=" << layerSize
-                      << " batchSize=" << batchSize << " reversed=" << reversed
-                      << " paddle_use_batch=" << paddle_use_batch
-                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
-
-            checkMKLPackedLayer(layerConfig1,
-                                layerConfig2,
-                                reversed,
-                                layerSize,
-                                batchSize,
-                                paddle_use_batch,
-                                MKLPacked_use_batch);
-          }
-        }
-      }
-    }
-  }
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  if (!version::isWithGpu()) {
-    testing::GTEST_FLAG(filter) = "-Layer.*";
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
deleted file mode 100644
index 1975d9196..000000000
--- a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
+++ /dev/null
@@ -1,471 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <math.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-#include <ctime>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/FullyConnectedLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(num_passes);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(config_args);
-
-size_t fcLayerWidth = 1024;
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-int randint(int* data, size_t int_max, size_t size) {
-  srand((size_t)(time(NULL)));
-  if (int_max < size) {
-    return -1;
-  }
-  size_t count = 0;
-  std::map<int, int> tmp;
-  int this_int = 0;
-
-  while (count < size) {
-    this_int = std::rand() % int_max;  // NOLINT
-    if (tmp.find(this_int) == tmp.end()) {
-      tmp[this_int] = 0;
-      count += 1;
-    }
-  }
-
-  if (tmp.size() != size) {
-    return -1;
-  }
-  count = 0;
-  for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) {
-    data[count] = itr->first;
-    count += 1;
-  }
-  return 0;
-}
-
-void calcOutput(ComData& comData,
-                const string configFile,
-                const string configArgs,
-                bool useGpu) {
-  FLAGS_config = configFile;
-  FLAGS_config_args = configArgs;
-  FLAGS_use_gpu = useGpu;
-  FLAGS_init_model_path = "legacy/gserver/tests/SelectiveFcTest/model";
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlags(), false);
-
-  comData.parameters = trainer.getGradientMachine()->getParameters();
-
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  DataBatch dataBatch;
-  dataProvider->setSkipShuffle();
-  dataProvider->reset();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &comData.outArgs, PASS_TRAIN);
-  trainer.getGradientMachine()->finish();
-}
-
-void checkMatrix(real* A, real* B, size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  int diffNum = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) ||
-        std::isnan(B[i])) {
-    } else if (fabs(A[i] - B[i]) > err) {
-      diffNum++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void checkTranspose(real* matrix,
-                    real* transpose,
-                    size_t width,
-                    size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t height = matSize / width;
-  int diffNum = 0;
-  size_t rowId = 0;
-  size_t colId = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (i % width == 0 && i) {
-      rowId++;
-    }
-    colId = i % width;
-    if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) {
-      diffNum++;
-      LOG(INFO) << i << " diff : " << matrix[i] << "\t"
-                << transpose[colId * height + rowId];
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void compareOutput(ComData& fcData, ComData& selFcData) {
-  vector<Argument> outArgsFc = fcData.outArgs;
-  vector<Argument> outArgsSelfc = selFcData.outArgs;
-
-  // check cost
-  LOG(INFO) << "Check cost";
-  CpuMatrix fcCost(outArgsFc[0].value->getHeight(),
-                   outArgsFc[0].value->getWidth());
-  CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(),
-                      outArgsSelfc[0].value->getWidth());
-  fcCost.copyFrom(*outArgsFc[0].value);
-  selfcCost.copyFrom(*outArgsSelfc[0].value);
-  checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt());
-
-  // check selective fc output and fc output
-  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer "
-            << "with FullyConectedLayer";
-  CpuMatrix fcOut(outArgsFc[1].value->getHeight(),
-                  outArgsFc[1].value->getWidth());
-  CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(),
-                     outArgsSelfc[1].value->getWidth());
-
-  fcOut.copyFrom(*outArgsFc[1].value);
-  selfcOut.copyFrom(*outArgsSelfc[1].value);
-  checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt());
-
-  // check gradient math
-  vector<ParameterPtr>& fcParam = fcData.parameters;
-  vector<ParameterPtr>& selfcParam = selFcData.parameters;
-  for (size_t i = 0; i < fcParam.size(); ++i) {
-    ParameterPtr p1, p2;
-    p1 = fcParam[i];
-    p2 = selfcParam[i];
-
-    string paramName = p1->getName();
-    LOG(INFO) << "check parameter : " << paramName;
-
-    // check parameter value
-    CpuVector paraValue1(p1->getSize());
-    CpuVector paraValue2(p2->getSize());
-    paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE));
-    paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE));
-
-    // check gradient
-    CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT));
-    CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT));
-    if (paramName == "rand_fc_param.bias") {
-      checkMatrix(
-          paraValue1.getData(), paraValue2.getData(), paraValue1.getSize());
-      checkMatrix(
-          paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize());
-    } else {
-      checkTranspose(paraValue1.getData(),
-                     paraValue2.getData(),
-                     fcLayerWidth,
-                     paraValue1.getSize());
-      checkTranspose(paraGrad1.getData(),
-                     paraGrad2.getData(),
-                     fcLayerWidth,
-                     paraGrad1.getSize());
-    }
-  }
-}
-
-void compareSparseMulOutput(
-    real* fcOutput,
-    real* selOutput,
-    size_t nnz,
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& selCols) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t nnzCount =
-      std::accumulate(selCols->begin(),
-                      selCols->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-  EXPECT_EQ(nnz, nnzCount);
-
-  size_t sampleNum = selCols->size();
-  int diffNum = 0;
-  size_t count = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    for (size_t j = 0; j < (*selCols)[i].second; ++j) {
-      size_t selIdx = (*selCols)[i].first[j];
-      if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) {
-        diffNum++;
-        LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx]
-                  << "\t" << selOutput[count];
-      }
-      count++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        size_t layerSize,
-                        std::vector<real>& values,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->copyFrom(values.data(), batchSize * layerSize);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_TEST);
-  return layer;
-}
-
-ParameterPtr creatParameter(
-    string name, int pid, size_t paraSize, string paramFile, bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->randomize();
-  parameter->setID(pid);
-  parameter->load(paramFile);
-  return parameter;
-}
-
-LayerPtr initFcLayer(LayerPtr dataLayer,
-                     LayerConfig layerConfig,
-                     int dataLayerSize,
-                     int fcLayerSize,
-                     string paraName,
-                     string paraFile,
-                     bool useGpu) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-
-  layerMap[dataLayer->getName()] = dataLayer;
-  ParameterPtr para = creatParameter(
-      paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name(dataLayer->getName());
-  input.set_input_parameter_name(paraName);
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->setNeedGradient(false);
-  testLayer->init(layerMap, parameterMap);
-  return testLayer;
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in fc.conf and selective_fc.conf is float
-TEST(Layer, SelectiveFcLayer_train_dense_mul) {
-  const string& fcConfig = "legacy/gserver/tests/SelectiveFcTest/conf/fc.conf";
-  const string& fcConfigArgs =
-      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
-  const string& selFcConfig =
-      "legacy/gserver/tests/SelectiveFcTest/conf/selective_fc.conf";
-  const string& selConfigArgs =
-      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
-
-  for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-    if (useGpu) {
-      break;
-    }
-#endif
-    LOG(INFO) << "FullyConnectedLayer forwardBackward()";
-    ComData fcData;
-    calcOutput(fcData, fcConfig, fcConfigArgs, useGpu);
-
-    LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()";
-    ComData selFcData;
-    calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu);
-    compareOutput(fcData, selFcData);
-  }
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
-                                        bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  size_t batchSize = 100;
-  size_t dataLayerSize = 512;
-  std::vector<real> values(batchSize * dataLayerSize);
-  for (size_t j = 0; j < batchSize * dataLayerSize; ++j) {
-    values[j] = std::rand() / real(RAND_MAX);
-  }
-  LayerPtr dataLayer =
-      creatDataLayer("data", batchSize, dataLayerSize, values, useGpu);
-
-  const string& selfcParaFile =
-      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
-  const string& selfcParaName = "rand_fc_param.w.transpose";
-
-  std::shared_ptr<SelectiveFullyConnectedLayer> selfcLayer =
-      std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>(
-          initFcLayer(dataLayer,
-                      config,
-                      dataLayerSize,
-                      fcLayerWidth,
-                      selfcParaName,
-                      selfcParaFile,
-                      useGpu));
-
-  // create selected columns
-  std::shared_ptr<std::vector<std::pair<int*, size_t>>> selCols(
-      new std::vector<std::pair<int*, size_t>>(batchSize));
-  size_t maxNNZ = 30;
-  srand((size_t)(time(NULL)));
-  int total = 0;
-  while (total == 0) {
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t num = std::rand() % maxNNZ;
-      int* data = new int[num];
-      randint(data, fcLayerWidth, num);
-      (*selCols)[i] = std::make_pair(data, num);
-      total += num;
-    }
-  }
-  selfcLayer->fillSelectiveData(selCols);
-  selfcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatSelfc = selfcLayer->getOutputValue();
-  CpuSparseMatrixPtr cpuOutMatSelfc(
-      new CpuSparseMatrix(outMatSelfc->getHeight(),
-                          outMatSelfc->getWidth(),
-                          outMatSelfc->getElementCnt()));
-  cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueSelfc = cpuOutMatSelfc->getValue();
-  size_t nnz = cpuOutMatSelfc->getElementCnt();
-
-  const string& fcParaFile =
-      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
-  const string& fcParaName = "rand_fc_param.w";
-  LayerConfig fcLayerConfig;
-  fcLayerConfig.set_name("fc_layer");
-  fcLayerConfig.set_type("fc");
-  fcLayerConfig.set_active_type("linear");
-  fcLayerConfig.set_size(fcLayerWidth);
-
-  LayerPtr fcLayer = initFcLayer(dataLayer,
-                                 fcLayerConfig,
-                                 dataLayerSize,
-                                 fcLayerWidth,
-                                 fcParaName,
-                                 fcParaFile,
-                                 useGpu);
-  fcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatFc = fcLayer->getOutputValue();
-  MatrixPtr cpuOutMatFc(
-      new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
-  cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueFc = cpuOutMatFc->getData();
-
-  compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols);
-  for (size_t i = 0; i < batchSize; ++i) {
-    delete[](*selCols)[i].first;
-  }
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in testSelectiveFcLayerTrainSparseMul is float
-TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
-  LayerConfig selLayerConfig;
-  selLayerConfig.set_name("sel_fc");
-  selLayerConfig.set_type("selective_fc");
-  selLayerConfig.set_active_type("linear");
-  selLayerConfig.set_has_selected_colums(false);
-  selLayerConfig.set_selective_fc_pass_generation(true);
-  selLayerConfig.set_size(fcLayerWidth);
-
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifdef PADDLE_WITH_CUDA
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
-#endif
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-// TODO(dangqingqing) test multi threads after support in matrix
-// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) {
-//   LayerConfig selLayerConfig;
-//   selLayerConfig.set_name("sel_fc");
-//   selLayerConfig.set_type("selective_fc");
-//   selLayerConfig.set_active_type("linear");
-//   selLayerConfig.set_has_selected_colums(false);
-//   selLayerConfig.set_selective_fc_pass_generation(true);
-//   selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10);
-//   selLayerConfig.set_selective_fc_full_mul_ratio(1000);
-//   selLayerConfig.set_size(fcLayerWidth);
-//   SelectiveFcLayer_test(selLayerConfig, false);
-// }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
deleted file mode 100644
index 05acd7142..000000000
--- a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const int MAX_SEQ_NUM = 17;
-const int MAX_SEQ_LEN = 23;
-const int MAX_BEAM_SIZE = 13;
-
-const size_t SEED = (size_t)(time(NULL));
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
-  seqStartPos.resize(1, 0);
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int j = 0; j < subSeqNum; ++j)
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % MAX_SEQ_LEN)));
-    seqStartPos.push_back(subSeqStartPos.back());
-  }
-}
-
-/*
-  generate start indices according to sequence start positions.
- */
-void genStarts(vector<int>& seqStartPos,
-               vector<vector<real>>& starts,
-               size_t beamSize) {
-  starts.clear();
-  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    vector<real> randStarts =
-        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
-    copy(begin(randStarts), end(randStarts), begin(starts[i]));
-  }
-}
-
-/*
-  generate end indices according to sequence start positions and start indices.
- */
-void genEnds(vector<int>& seqStartPos,
-             vector<vector<real>>& starts,
-             vector<vector<real>>& ends,
-             size_t beamSize) {
-  CHECK_EQ(seqStartPos.size() - 1, starts.size());
-  ends.clear();
-  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < starts.size(); ++i) {
-    for (size_t j = 0; j < starts[i].size(); ++j) {
-      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-      CHECK_GE(seqLen - 1, starts[i][j]);
-      if (starts[i][j] == -1.) break;
-      if (starts[i][j] == (seqLen - 1)) {
-        ends[i][j] = starts[i][j];
-      } else {
-        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
-      }
-    }
-  }
-}
-
-void genTestData(vector<int>& seqStartPos,
-                 vector<int>& subSeqStartPos,
-                 vector<vector<real>>& starts,
-                 vector<vector<real>>& ends,
-                 bool hasSubseq) {
-  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
-  genSeqInfo(seqStartPos, subSeqStartPos);
-
-  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
-  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
-}
-
-template <typename T>
-void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
-  size_t totalSize{0};
-  for (auto const& items : inVec) totalSize += items.size();
-  outVec.reserve(totalSize);
-
-  for (auto& items : inVec)
-    move(items.begin(), items.end(), back_inserter(outVec));
-}
-
-void testSeqSliceLayer(bool hasSubseq,
-                       bool useGpu,
-                       vector<int>& seqStartPos,
-                       vector<int>& subSeqStartPos,
-                       vector<vector<real>>& starts,
-                       vector<vector<real>>& ends) {
-  // layer size is not crutial for this layer,
-  // so here use a small layer size in the unittest.
-  const size_t layerSize{4};
-  TestConfig config;
-  config.layerConfig.set_type("seq_slice");
-  config.layerConfig.set_size(layerSize);
-
-  // add the first input
-  MatrixPtr seqInputPtr =
-      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
-                     layerSize,
-                     false,
-                     false);
-  seqInputPtr->randomizeUniform();
-
-  if (hasSubseq) {
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                "seq_input",
-                                seqInputPtr,
-                                seqStartPos,
-                                subSeqStartPos});
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
-  }
-  config.layerConfig.add_inputs();
-
-  // add start indices
-  if (starts.size()) {
-    vector<real> startsToVec;
-    flatten2dVector(starts, startsToVec);
-
-    MatrixPtr startMatrixPtr =
-        Matrix::create(starts.size(), starts[0].size(), false, false);
-    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
-
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(true);
-  }
-
-  // add end indices
-  if (ends.size()) {
-    vector<real> endsToVec;
-    flatten2dVector(ends, endsToVec);
-
-    MatrixPtr endMatrixPtr =
-        Matrix::create(ends.size(), ends[0].size(), false, false);
-    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
-
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(false);
-  }
-
-  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
-}
-
-TEST(Layer, SeqSliceLayer) {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<vector<real>> starts;
-  vector<vector<real>> ends;
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-  genSeqInfo(seqStartPos, subSeqStartPos);
-  for (bool hasSubseq : {true, false}) {
-    LOG(INFO) << "hasSubSeq : " << hasSubseq;
-    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
-    for (bool useGpu : mode) {
-      vector<vector<real>> tmp;
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/gserver/tests/test_Upsample.cpp b/paddle/legacy/gserver/tests/test_Upsample.cpp
deleted file mode 100644
index 940d46baf..000000000
--- a/paddle/legacy/gserver/tests/test_Upsample.cpp
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/testing/TestUtil.h"
-
-void setPoolConfig(paddle::TestConfig* config,
-                   paddle::PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(1);
-
-  int kw = 2, kh = 2;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(2);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow =
-      paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh =
-      paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
-                                   const string& poolType,
-                                   bool use_gpu,
-                                   real* tempGradData) {
-  /* prepare maxPoolWithMaskLayer */
-  paddle::TestConfig config;
-  config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
-  paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
-  paddle::PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(8);
-  pool->set_img_size_y(8);
-  setPoolConfig(&config, pool, "max-pool-with-mask");
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  config.layerConfig.set_name("MaxPoolWithMask");
-
-  std::vector<paddle::DataLayerPtr> dataLayers;
-  paddle::LayerMap layerMap;
-  vector<paddle::Argument> datas;
-
-  initDataLayer(config,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "MaxPoolWithMask",
-                1,
-                false,
-                use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
-
-  FLAGS_use_gpu = use_gpu;
-  std::vector<paddle::ParameterPtr> parameters;
-  paddle::LayerPtr maxPoolingWithMaskOutputLayer;
-  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
-  maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
-
-  /* prepare the upsample layer */
-  paddle::LayerConfig upsampleLayerConfig;
-  upsampleLayerConfig.set_type("upsample");
-  paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
-  upsampleLayerConfig.add_inputs();
-
-  paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
-  upsampleConfig->set_scale(2);
-  paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
-  imageConfig->set_channels(2);
-  imageConfig->set_img_size(4);
-  imageConfig->set_img_size_y(4);
-  upsampleLayerConfig.set_size(2 * 8 * 8);
-  upsampleLayerConfig.set_name("upsample");
-
-  for (size_t i = 0; i < 2; i++) {
-    paddle::LayerInputConfig& inputTemp =
-        *(upsampleLayerConfig.mutable_inputs(i));
-    inputTemp.set_input_layer_name("MaxPoolWithMask");
-  }
-
-  paddle::LayerPtr upsampleLayer;
-  paddle::ParameterMap parameterMap;
-  upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
-  layerMap[upsampleLayerConfig.name()] = upsampleLayer;
-  upsampleLayer->init(layerMap, parameterMap);
-  upsampleLayer->setNeedGradient(true);
-  upsampleLayer->forward(paddle::PASS_GC);
-  upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
-  upsampleLayer->backward();
-
-  return upsampleLayer;
-}
-
-TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
-  bool useGpu = false;
-  paddle::MatrixPtr inputMat;
-  paddle::MatrixPtr inputGPUMat;
-  paddle::MatrixPtr tempGradMat;
-
-  inputMat = paddle::Matrix::create(1, 128, false, useGpu);
-  inputMat->randomizeUniform();
-
-  tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
-  tempGradMat->randomizeUniform();
-  real* tempGradData = tempGradMat->getData();
-
-  paddle::LayerPtr upsampleLayerCPU =
-      doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
-
-#ifdef PADDLE_WITH_CUDA
-  useGpu = true;
-  real* data = inputMat->getData();
-  inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
-  inputGPUMat->copyFrom(data, 128);
-  paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
-      inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
-  paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
-                           upsampleLayerGPU->getOutput("").value);
-
-  paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
-                           upsampleLayerGPU->getPrev(0)->getOutputGrad());
-#endif
-}
diff --git a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
deleted file mode 100644
index b1697e161..000000000
--- a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Version.h>
-#include "ModelConfig.pb.h"
-#include "paddle/legacy/gserver/layers/CTCLayer.h"
-#include "paddle/legacy/gserver/layers/DataLayer.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-#include "paddle/legacy/gserver/layers/WarpCTCLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-
-const real* getData(const Matrix& matrix) {
-  if (matrix.useGpu()) {
-    MatrixPtr cpuMatrix = Matrix::create(
-        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
-    cpuMatrix->copyFrom(matrix);
-    return cpuMatrix->getData();
-  } else {
-    return matrix.getData();
-  }
-}
-
-int checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
-  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
-  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-
-  const real* data1 = getData(matrix1);
-  const real* data2 = getData(matrix2);
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-  return count;
-}
-
-void initArgument(size_t batchSize,
-                  int layerSize,
-                  bool useGpu,
-                  Argument& data) {
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-}
-
-LayerPtr createDataLayer(
-    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createLabelLayer(string name,
-                          size_t batchSize,
-                          size_t numClasses,
-                          bool useGpu) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(1);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  Argument data;
-  data.ids = IVector::create(batchSize, useGpu);
-  data.ids->rand(numClasses - 1);
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  labelLayer->setData(data);
-  labelLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createCTCLayer(string name,
-                        size_t numClasses,
-                        bool useGpu,
-                        bool normByTimes,
-                        LayerPtr dataLayer,
-                        LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
-
-  softmaxActivation->forward(dataLayer->getOutput()).check();
-  layer->forward(PASS_GC);
-
-  layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput()).check();
-
-  return layer;
-}
-
-LayerPtr createWarpCTCLayer(string name,
-                            size_t numClasses,
-                            bool useGpu,
-                            bool normByTimes,
-                            LayerPtr dataLayer,
-                            LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("warp_ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_blank(numClasses - 1);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  layer->forward(PASS_GC);
-  layer->backward();
-
-  return layer;
-}
-
-TEST(Layer, WarpCTCLayer) {
-  for (auto layerSize : {10, 64}) {
-    for (auto batchSize : {1, 10, 32}) {
-      for (auto normByTimes : {false, true}) {
-        for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-          if (useGpu) continue;
-#endif
-          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
-
-          FLAGS_use_gpu = useGpu;
-
-          Argument data0;
-          initArgument(batchSize, layerSize, useGpu, data0);
-
-          Argument data1;
-          data1.resizeAndCopyFrom(data0);
-
-          LayerPtr dataLayer0 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data0);
-          LayerPtr dataLayer1 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data1);
-
-          LayerPtr labelLayer =
-              createLabelLayer("label", batchSize, layerSize, useGpu);
-
-          LayerPtr warpctcLayer = createWarpCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
-          LayerPtr ctcLayer = createCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
-
-          /// Check cost
-          LOG(INFO) << "Check cost: "
-                    << checkError(*(warpctcLayer->getOutput().value),
-                                  *(ctcLayer->getOutput().value))
-                    << " different elements.";
-
-          /// Check gradients
-          LOG(INFO) << "Check gradients: "
-                    << checkError(*(dataLayer0->getOutput().grad),
-                                  *(dataLayer1->getOutput().grad))
-                    << " different elements";
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/math/Allocator.h b/paddle/legacy/math/Allocator.h
deleted file mode 100644
index ffb5ec1ca..000000000
--- a/paddle/legacy/math/Allocator.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdlib.h>
-#include <mutex>
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * @brief Allocator base class.
- *
- * This is the base class of all Allocator class.
- */
-class Allocator {
- public:
-  virtual ~Allocator() {}
-  virtual void* alloc(size_t size) = 0;
-  virtual void free(void* ptr) = 0;
-  virtual std::string getName() = 0;
-};
-
-/**
- * @brief CPU allocator implementation.
- */
-class CpuAllocator : public Allocator {
- public:
-  ~CpuAllocator() {}
-
-  /**
-   * @brief Aligned allocation on CPU.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr;
-#ifdef PADDLE_WITH_MKLDNN
-    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
-    // memory alignment
-    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
-#else
-    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
-#endif
-    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-    return ptr;
-  }
-
-  /**
-   * @brief Free the memory space.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      ::free(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cpu_alloc"; }
-};
-
-/**
- * @brief GPU allocator implementation.
- */
-class GpuAllocator : public Allocator {
- public:
-  ~GpuAllocator() {}
-
-  /**
-   * @brief Allocate GPU memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_device(size);
-    CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the GPU memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_device(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "gpu_alloc"; }
-};
-
-/**
- * @brief CPU pinned memory allocator implementation.
- */
-class CudaHostAllocator : public Allocator {
- public:
-  ~CudaHostAllocator() {}
-
-  /**
-   * @brief Allocate pinned memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_host(size);
-    CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the pinned memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_host(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cuda_host_alloc"; }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.cu b/paddle/legacy/math/BaseMatrix.cu
deleted file mode 100644
index 7e7cdc57a..000000000
--- a/paddle/legacy/math/BaseMatrix.cu
+++ /dev/null
@@ -1,1953 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/Logging.h>
-#include <string.h>
-#include <cmath>
-#include "BaseMatrix.h"
-#include "MathFunctions.h"
-#include "NEONFunctions.h"
-#include "SIMDFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_base.cuh"
-#include "hl_matrix_ops.cuh"
-
-namespace paddle {
-
-const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op) {
-  MatrixOffset offset(0, 0);
-  applyUnary(op, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op,
-                               int numRows,
-                               int numCols,
-                               MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-
-  T* A = data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_unary_op(op, A, dimM, dimN, lda);
-  } else {
-    hl_cpu_apply_unary_op(op, A, dimM, dimN, lda);
-  }
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
-  CHECK(height_ == b.height_ && width_ == b.width_)
-      << "Matrix dimensions are not equal";
-
-  MatrixOffset offset(0, 0, 0, 0);
-  applyBinary(op, b, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(
-    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
-  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
-  return 0;
-}
-
-template <class T>
-template <class Op, class bAsRowVector, class bAsColVector>
-int BaseMatrixT<T>::applyBinary(Op op,
-                                BaseMatrixT& b,
-                                int numRows,
-                                int numCols,
-                                MatrixOffset& offset,
-                                bAsRowVector,
-                                bAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (!bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (!bAsRowVector::value && bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-  } else {
-  }
-  if (true == useGpu_) {
-    hl_gpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  } else {
-    hl_cpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  applyTernary(op, b, c, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset) {
-  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
-
-  return 0;
-}
-
-template <class T>
-template <class Op, class cAsRowVector, class cAsColVector>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset,
-                                 cAsRowVector,
-                                 cAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  if (!cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (!cAsRowVector::value && cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-  } else {
-  }
-
-  if (true == useGpu_) {
-    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  } else {
-    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-  CHECK_EQ(height_, d.height_);
-  CHECK_EQ(width_, d.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0);
-  applyQuaternary(op, b, c, d, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d,
-                                    int numRows,
-                                    int numCols,
-                                    MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-  CHECK_EQ(useGpu_, d.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-  int ldd = d.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  T* D = d.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-  CAL_MATRIX_START_ADDRESS(
-      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  CHECK_LE(dimM + offset.cRow_, c.height_);
-  CHECK_LE(dimN + offset.cCol_, c.width_);
-  CHECK_LE(dimM + offset.dRow_, d.height_);
-  CHECK_LE(dimN + offset.dCol_, d.width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  } else {
-    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    } else {
-      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    } else {
-      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              BaseMatrixT& c,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-/**
- * @brief   unary operator.
- *
- */
-
-DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
-template <class T>
-void BaseMatrixT<T>::neg() {
-  applyUnary(unary::Neg<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
-template <>
-void BaseMatrixT<real>::exp2() {
-  applyUnary(unary::Exp<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
-template <>
-void BaseMatrixT<real>::log2() {
-  if (useGpu_) {
-    applyUnary(unary::Log<real>());
-  } else {
-    vLog(height_ * width_, data_, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
-template <>
-void BaseMatrixT<real>::sqrt2() {
-  applyUnary(unary::Sqrt<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
-template <class T>
-void BaseMatrixT<T>::square2() {
-  applyUnary(unary::Square<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2() {
-  applyUnary(unary::Reciprocal<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2() {
-  applyUnary(unary::Abs<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
-template <class T>
-void BaseMatrixT<T>::sign2() {
-  applyUnary(unary::Sign<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-template <class T>
-void BaseMatrixT<T>::zero() {
-  applyUnary(unary::Zero<T>());
-}
-
-template <class T>
-void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
-  int numRows = height_;
-  int numCols = numColumns;
-  MatrixOffset offset(columnOffset, 0);
-  applyUnary(unary::Zero<T>(), numRows, numCols, offset);
-}
-
-DEFINE_MATRIX_UNARY_OP(One, a = 1);
-template <class T>
-void BaseMatrixT<T>::one() {
-  applyUnary(unary::One<T>());
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
-template <>
-void BaseMatrixT<real>::pow2(real p) {
-  if (useGpu_) {
-    applyUnary(unary::Pow<real>(p));
-  } else {
-    vPow(height_ * width_, data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
-template <class T>
-void BaseMatrixT<T>::subScalar(T p) {
-  applyUnary(unary::SubScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(T p) {
-  applyUnary(unary::MulScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
-template <class T>
-void BaseMatrixT<T>::divScalar(T p) {
-  applyUnary(unary::DivScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
-template <class T>
-void BaseMatrixT<T>::assign(T p) {
-  applyUnary(unary::Assign<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
-template <class T>
-void BaseMatrixT<T>::add(T p) {
-  applyUnary(unary::Add<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
-template <class T>
-void BaseMatrixT<T>::add(T p1, T p2) {
-  applyUnary(unary::Add2<T>(p1, p2));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
-                                 TWO_PARAMETER,
-                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
-template <class T>
-void BaseMatrixT<T>::clip(T p1, T p2) {
-  applyUnary(unary::Clip<T>(p1, p2));
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
-                                  TWO_PARAMETER,
-                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
-template <class T>
-void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
-                                 ONE_PARAMETER,
-                                 a = a > p ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThanScalar(T p) {
-  applyUnary(unary::BiggerThanScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
-template <class T>
-void BaseMatrixT<T>::downClip(T p) {
-  applyUnary(unary::DownClip<T>(p));
-}
-
-/**
- * @brief   binary operator.
- *
- */
-
-DEFINE_MATRIX_BINARY_OP(Add, a += b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b) {
-  applyBinary(binary::Add<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::add(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Add<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vAdd(height_ * width_, data_, b.data_, data_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
-  T* A = data_;
-  T* B = b.data_;
-  int dimM = height_;
-  int dimN = width_;
-
-  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
-      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
-  applyBinary(binary::Add1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
-template <>
-void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
-  if (useGpu_) {
-    applyBinary(binary::Pow<real>(p), b);
-  } else {
-    vPow(height_ * width_, b.data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Add2<T>(p1, p2), b);
-}
-
-template <class T>
-void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add1<T>(scale),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b) {
-  applyBinary(binary::Sub<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
-  applyBinary(binary::Sub1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
-template <class T>
-void BaseMatrixT<T>::relu(BaseMatrixT& b) {
-  applyBinary(binary::Relu<T>(), b);
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-void BaseMatrixT<float>::relu(BaseMatrixT& b) {
-  neon::relu(data_, b.data_, height_ * width_);
-}
-#endif
-
-DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
-template <class T>
-void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReluDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
-                        b = log(1.0 + exp((a > THRESHOLD)
-                                              ? THRESHOLD
-                                              : ((a < -THRESHOLD) ? (-THRESHOLD)
-                                                                  : a))));
-template <>
-void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
-  applyBinary(binary::Softrelu<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(
-    SoftreluDerivative, const T THRESHOLD = 40.0;
-    a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-                                ? THRESHOLD
-                                : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-template <>
-void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SoftreluDerivative<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
-                                  b = b < p2 ? b : p2);
-template <class T>
-void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
-  applyBinary(binary::Brelu<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
-                                  TWO_PARAMETER,
-                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
-template <class T>
-void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;
-  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
-template <class T>
-void BaseMatrixT<T>::square2(BaseMatrixT& b) {
-  applyBinary(binary::Square<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
-template <class T>
-void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SquareDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <>
-void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
-  applyBinary(binary::Tanh<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
-template <class T>
-void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
-  applyBinary(binary::TanhDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(
-    ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
-template <>
-void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
-  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
-                                  TWO_PARAMETER,
-                                  a *= p2 * (p1 - b * b));
-template <class T>
-void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
-  applyBinary(binary::Reciprocal<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
-template <class T>
-void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReciprocalDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
-  applyBinary(binary::Abs<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
-template <class T>
-void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
-  applyBinary(binary::AbsDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
-                        const T THRESHOLD_MAX = 13.0;
-                        T tmp = (a < THRESHOLD_MIN)
-                                    ? THRESHOLD_MIN
-                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-                        b = 1.0f / (1.0f + exp(-tmp)));
-template <>
-void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Sigmoid<real>(), b);
-  } else {  // cpu versioni
-    size_t numSamples = this->height_;
-    size_t dim = this->width_;
-    CHECK_EQ(b.height_, numSamples);
-    CHECK_EQ(b.width_, dim);
-    const real* in = this->data_;
-    real* out = b.data_;
-
-    // out = - in
-    const float THRESHOLD_MIN = -40.0;  // make sure sigmoid(x) > 0
-    const float THRESHOLD_MAX = 13.0;   // make sure sigmoid(x) < 1
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      real tmp = in[i];
-      tmp = (tmp < THRESHOLD_MIN)
-                ? THRESHOLD_MIN
-                : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp);
-      out[i] = -tmp;
-    }
-
-    // out = exp(out)
-    vExp(numSamples * dim, out, out);
-
-    // out = 1 / (1 + out)
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      out[i] = 1 / (1 + out[i]);
-    }
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
-template <class T>
-void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SigmoidDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
-template <class T>
-void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ExpDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
-template <class T>
-void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
-  applyBinary(binary::Sign<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
-template <>
-void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
-  applyBinary(binary::Exp<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
-template <>
-void BaseMatrixT<real>::log2(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Log<real>(), b);
-  } else {
-    vLog(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
-template <>
-void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
-  applyBinary(binary::Sqrt<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
-template <>
-void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::InvSqrt<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vInvSqrt(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
-template <class T>
-void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
-  applyBinary(binary::IsEqual<T>(value), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
-template <class T>
-void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
-template <class T>
-void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::SubScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::MulScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
-template <class T>
-void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::DivScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
-template <class T>
-void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
-  applyBinary(binary::ScalarDiv<T>(p), b);
-}
-
-/**
- * @brief   ternary operator.
- *
- */
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
-                         a = -c * log(b) - (1 - c) * log(1 - b));
-template <>
-void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
-template <class T>
-void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
-                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
-template <>
-void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
-                                                BaseMatrixT& c) {
-  if (useGpu_) {
-    applyTernary(ternary::BinaryCrossEntropy<real>(), b, c);
-  } else {
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(height_, c.height_);
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(width_, c.width_);
-
-    size_t size = height_ * width_;
-    real* out = b.data_;
-    real* label = c.data_;
-    real* cost = data_;
-
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i];
-    }
-    vLog(size, cost, cost);
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] *= -1.0;
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
-                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
-template <class T>
-void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Add1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Sub<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add2<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
-                                   THREE_PARAMETER,
-                                   a = p1 * a + p2 * b + p3 * c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
-                                   THREE_PARAMETER,
-                                   c = p2 * c - p1 * (b + p3 * a);
-                                   a = a + c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
-                               BaseMatrixT& c,  // mom
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
-                                      THREE_PARAMETER,
-                                      c = p2 * c - p1 * d * (b + p3 * a);
-                                      a += c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
-                               BaseMatrixT& c,  // mom,
-                               BaseMatrixT& d,  // lr,
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
-                                  a = (a > lambda)
-                                          ? (a - lambda)
-                                          : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
-  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
-                                real learningRate,
-                                real decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
-  } else {
-    simd::decayL1(this->data_,
-                  this->data_,
-                  lr.data_,
-                  learningRate * decayRate,
-                  height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
-                                 a = (a > lambda)
-                                         ? (a - lambda)
-                                         : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
-  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
-  if (useGpu_) {
-    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
-  } else {
-    simd::decayL1(
-        this->data_, this->data_, learningRate * decayRate, height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
-                                  ONE_PARAMETER,
-                                  a *= (1.0f / (1.0f + p * b)));
-template <class T>
-void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
-  } else {
-    size_t size = this->height_ * this->width_;
-    T decay = learningRate * decayRate;
-    for (size_t j = 0; j < size; ++j) {
-      this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
-  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
-  applyBinary(binary::DotMul<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMul<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotDiv<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
-                                   TWO_PARAMETER,
-                                   a = (b + p1) / (c + p2));
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = log(1 + exp(a)) - a * d);
-template <>
-void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLoss<real>(), b, c, d);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = exp(a);
-                            a = (a / (1 + a) - d));
-template <>
-void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
-                                   BaseMatrixT& c,
-                                   BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLossBp<real>(), b, c, d);
-}
-
-/* this = log(1 + exp(b)) - c * b */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         a = log(1 + exp(x)) - c * x);
-template <>
-void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
-}
-
-/* this = exp(b)/(1+exp(b)) - c */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         x = exp(x);
-                         a = x / (1 + x) - c);
-template <>
-void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
-                                                 BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BiggerThan<T>(), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(
-    BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
-                                BaseMatrixT& c,
-                                BaseMatrixT& d) {
-  applyQuaternary(quaternary::BiggerThan<T>(), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
-template <class T>
-void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Max<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
-                                   ONE_PARAMETER,
-                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
-template <class T>
-void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
-                                                BaseMatrixT& b,
-                                                BaseMatrixT& c,
-                                                T p) {
-  CHECK(!useGpu_) << "do not support gpu";
-  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  b.applyTernary(ternary::BinaryClassificationError<T>(p),
-                 c,
-                 *this,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-}
-
-template <>
-void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
-                                                  BaseMatrixT& b,
-                                                  BaseMatrixT& c,
-                                                  real p) {
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::classificationError(p),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
-                                      THREE_PARAMETER,
-                                      a = p1 * b + p2 * c + p3 * d);
-template <class T>
-void BaseMatrixT<T>::add3(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMulSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotSquareSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
-  applyBinary(binary::DotMulSquare<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
-template <class T>
-void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
-  applyBinary(binary::DotSquareMul<T>(), b);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
-                                      THREE_PARAMETER,
-                                      T tmp = p1 * b + p2 * c + p3 * d;
-                                      a += tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::addSquareSum(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
-template <class T>
-void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddSquare<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
-                                  TWO_PARAMETER,
-                                  a = p1 * a + p2 * b * b);
-template <class T>
-void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
-                                       BaseMatrixT& c,
-                                       T p1,
-                                       T p2) {
-  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
-                                   THREE_PARAMETER,
-                                   a = 1 / (p1 * b + p2 * c + p3));
-template <class T>
-void BaseMatrixT<T>::reciprocalSum(
-    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
-                                  TWO_PARAMETER,
-                                  a = 1 / (p1 * b + p2));
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a *= tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
-                                     BaseMatrixT& c,
-                                     T p1,
-                                     T p2) {
-  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a = tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
-                                   TWO_PARAMETER,
-                                   a *= p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
-template <class T>
-void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
-  applyBinary(binary::CopyAndClear<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
-template <class T>
-void BaseMatrixT<T>::assign(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Assign<T>(), b);
-  } else {  // cpu version
-    CHECK_EQ(this->height_, b.height_);
-    CHECK_EQ(this->width_, b.width_);
-    memcpy(data_, b.data_, sizeof(T) * height_ * width_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
-template <class T>
-void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
-  applyBinary(binary::DeepSwap<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::rowDotMul(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c) {
-  int numRows = b.height_;
-  int numCols = b.width_;
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowDotMul2(size_t destCol,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  CHECK_LT(destCol, this->width_);
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(height, c.height_);
-  CHECK_EQ(b.width_, c.width_);
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height;
-       ++i, A += this->width_, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[destCol] += B[j] * C[j];
-    }
-  }
-}
-
-template <>
-void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(height_, 1LU);
-  CHECK_EQ(b.height_, c.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = b.height_;
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /*cAsRowVector*/,
-               false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(c.height_, 1LU);
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = height_;
-  size_t width = width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  size_t width = this->width_;
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(width, b.width_);
-  CHECK_LT(cCol, c.width_);
-  CHECK_EQ(height, c.height_);
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] = B[j] * C[cCol];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
-template <class T>
-void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::RowAdd<T>(p),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
-template <>
-void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  if (useGpu_) {
-    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-    int numRows = height_;
-    int numCols = width_;
-    applyTernary(ternary::RowPow<real>(),
-                 b,
-                 c,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-  } else {
-    size_t height = this->height_;
-    size_t width = this->width_;
-    CHECK_EQ(height, b.height_);
-    CHECK_EQ(width, b.width_);
-    CHECK_LT(cCol, c.width_);
-    CHECK_EQ(height, c.height_);
-    real* A = this->data_;
-    const real* B = b.data_;
-    const real* C = c.data_;
-    for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-      vPow(width, B, C[cCol], A);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
-template <class T>
-void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyRow(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg, class Op, class Saver>
-int BaseMatrixT<real>::applyRow(
-    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  CHECK_EQ(c.height_, numRows);
-  CHECK_EQ(c.width_, numCols);
-  aggregate(agg,
-            op,
-            sv,
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-  return 0;
-}
-
-template <>
-template <class Agg, class Op>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                Op op,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  if (scaleDest != 0) {
-    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
-  } else {
-    applyRow(agg, op, base::binary::second(), b, c);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyCol(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
-  applyRow(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
-  applyRow(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
-  applyCol(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
-  applyCol(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
-                                          BaseMatrixT& c,
-                                          real scaleSum,
-                                          real scaleDest) {
-  applyRow(
-      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
-                                      BaseMatrixT& c,
-                                      real scaleSum,
-                                      real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
-}
-
-template class BaseMatrixT<real>;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-template class BaseMatrixT<int>;
-
-#else
-
-template <>
-void BaseMatrixT<int>::zero() {
-  applyUnary(unary::Zero<int>());
-}
-
-template <>
-void BaseMatrixT<int>::assign(int p) {
-  applyUnary(unary::Assign<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
-  applyBinary(binary::IsEqual<int>(value), b);
-}
-
-template <>
-void BaseMatrixT<int>::neg() {
-  applyUnary(unary::Neg<int>());
-}
-
-template <>
-void BaseMatrixT<int>::abs2() {
-  applyUnary(unary::Abs<int>());
-}
-
-template <>
-void BaseMatrixT<int>::add(int p) {
-  applyUnary(unary::Add<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::add(int p1, int p2) {
-  applyUnary(unary::Add2<int>(p1, p2));
-}
-
-template <>
-void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
-  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
-}
-
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.h b/paddle/legacy/math/BaseMatrix.h
deleted file mode 100644
index 4627f847d..000000000
--- a/paddle/legacy/math/BaseMatrix.h
+++ /dev/null
@@ -1,1095 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "TensorExpression.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/*
- * nvcc currently does not support C++11,
- * so I realized false_type and true_type.
- */
-template <class T, T v>
-struct bool_constant {
-  static const T value = v;
-};
-typedef bool_constant<bool, false> false_type;
-typedef bool_constant<bool, true> true_type;
-
-/**
- * @brief   Calculate matrix element address.
- *
- * For instance, address of A[i][j] = i * ld + j.
- *
- */
-#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \
-  CHECK_LE(col, width);                                                \
-  CHECK_LE(row, height);                                               \
-  address += row * ld + col;
-
-class MatrixOffset {
- public:
-  size_t aCol_;
-  size_t aRow_;
-  size_t bCol_;
-  size_t bRow_;
-  size_t cCol_;
-  size_t cRow_;
-  size_t dCol_;
-  size_t dRow_;
-  MatrixOffset(size_t aCol = 0,
-               size_t aRow = 0,
-               size_t bCol = 0,
-               size_t bRow = 0,
-               size_t cCol = 0,
-               size_t cRow = 0,
-               size_t dCol = 0,
-               size_t dRow = 0)
-      : aCol_(aCol),
-        aRow_(aRow),
-        bCol_(bCol),
-        bRow_(bRow),
-        cCol_(cCol),
-        cRow_(cRow),
-        dCol_(dCol),
-        dRow_(dRow) {}
-};
-
-template <class T>
-class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
- public:
-  size_t height_, width_;
-  size_t stride_;
-  T* data_;
-  bool trans_;
-  bool useGpu_;
-
- public:
-  virtual ~BaseMatrixT() {}
-  BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
-      : height_(height),
-        width_(width),
-        stride_(width),
-        data_(data),
-        trans_(trans),
-        useGpu_(useGpu) {}
-
-  /**
-   * @note This constructor is for temporarily making a matrix with different
-   *       useGpu flag as the original matrix so that mixed gpu/cpu operations
-   *       can be performed successfully.
-   */
-  BaseMatrixT(BaseMatrixT& mat, bool useGpu)
-      : height_(mat.height_),
-        width_(mat.width_),
-        stride_(mat.stride_),
-        data_(mat.data_),
-        trans_(mat.trans_),
-        useGpu_(useGpu) {}
-
-  BaseMatrixT(size_t height,
-              size_t width,
-              size_t stride,
-              T* data,
-              bool trans,
-              bool use_gpu)
-      : height_(height),
-        width_(width),
-        stride_(stride),
-        data_(data),
-        trans_(trans),
-        useGpu_(use_gpu) {
-    /* CHECK_LE(width_, stride_); */
-  }
-
-  /// caller should make sure that the size of data is at least height*width
-  void setData(T* data) { data_ = data; }
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op);
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *  A = this->data_ + offset.aRow_*ld + offset.aCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * binary operator: element wise op(a, b).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * While this->height_ == b.height_ && this->width_ == b.width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyBinary(Op op, BaseMatrixT& b);
-
-  /**
-   * binary operator: element wise op(a, b)
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *   A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *   B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *
-   * if (bAsRowVector == false_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[i * ldb + j])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[j])
-   *
-   * if (bAsRowVector == false_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[i * ldb])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[0])
-   * @endcode
-   */
-  template <class Op, class bAsRowVector, class bAsColVector>
-  int applyBinary(Op op,
-                  BaseMatrixT& b,
-                  int numRows,
-                  int numCols,
-                  MatrixOffset& offset,
-                  bAsRowVector,
-                  bAsColVector);
-
-  template <class Op>
-  int applyBinary(
-      Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   *  for 0 <= i < numRows & for 0 <= j < numCols.
-   *  While matrix start address is:
-   *
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
-   *
-   *    if (cAsRowVector == true_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[j])
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == true_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
-   *
-   *    if (cAsRowVector == 1 && cAsColVector == 1)
-   *      op(A[i*lda + j], B[i*ldb + j], C[0])
-   * @endcode
-   */
-  template <class Op, class cAsRowVector, class cAsColVector>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset,
-                   cAsRowVector,
-                   cAsColVector);
-
-  template <class Op>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   *    && this->height_ == d.height_ && this->width_ == d.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *    D = d->data_ + offset.dRow_*ldd + offset.dCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op,
-                      BaseMatrixT& b,
-                      BaseMatrixT& c,
-                      BaseMatrixT& d,
-                      int numRows,
-                      int numCols,
-                      MatrixOffset& offset);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b.
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *  for each column j & 0 <= i < numRows, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *  for each row i & 0 <= j < numCols, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b and c.
-   *
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *   for each column j & 0 <= i < numRows, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *   for each row i & 0 <= j < numCols, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                BaseMatrixT& c,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   this[i] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyRow(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Op, class Saver>
-  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg, class Op>
-  int applyRow(Agg agg,
-               Op op,
-               real scaleDest,
-               real scaleAgg,
-               BaseMatrixT& b,
-               BaseMatrixT& c);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   this[j] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyCol(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[j] = sv(this[j], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  bool useGpu() const { return useGpu_; }
-
-  const T* rowBuf(size_t row) const { return data_ + width_ * row; }
-
-  T* rowBuf(size_t row) { return data_ + width_ * row; }
-
-  /**
-   * @brief   unary operator.
-   *
-   */
-  void neg();
-  void exp2();
-  void pow2(T p);
-  void log2();
-  void sqrt2();
-  void square2();
-  void reciprocal2();
-  void abs2();
-  void sign2();
-  void zero();
-
-  /**
-   * @code
-   * this(row, col + columnOffset) = 0 for 0 <= col < numColumns
-   * @endcode
-   */
-  void zeroAtOffset(int64_t columnOffset, int64_t numColumns);
-  void one();
-  void subScalar(T p);
-  void mulScalar(T p);
-  void divScalar(T p);
-
-  /**
-   * @code
-   * this = p
-   * @endcode
-   */
-  void assign(T p);
-
-  /**
-   * @code
-   * swap(this, b)
-   * example: swap two Matrices
-   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-   * cpuA->deepSwap(*cpuB);
-   * @endcode
-   */
-  void deepSwap(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + p
-   * @endcode
-   */
-  void add(T p);
-
-  /**
-   * @code
-   * this = this*p1 + p2
-   * @endcode
-   */
-  void add(T p1, T p2);
-
-  /**
-   * this = this < low ? low : this
-   *
-   * this = this > high ? high : this
-   */
-  void clip(T p1, T p2);
-
-  /**
-   * this = b < low ? 0 : 1
-   *
-   * this = b > high ? 0 : 1
-   */
-  void clipDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * a = a > p ? 1.0f : 0.0f
-   * @endcode
-   */
-  void biggerThanScalar(T p);
-
-  /**
-   * @code
-   * a = a > p ? a : p
-   * @endcode
-   */
-  void downClip(T p);
-
-  /**
-   * @code
-   * this = b
-   * @endcode
-   */
-  void assign(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void assignAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  /// this = this + b
-  void add(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void addAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  void addColVector(BaseMatrixT& b);
-  void addRowVector(BaseMatrixT& b);
-  void addBias(BaseMatrixT& b, T scale);
-
-  void mulRowVector(BaseMatrixT& b);
-  void divRowVector(BaseMatrixT& b);
-
-  void mulColVector(BaseMatrixT& b);
-  void divColVector(BaseMatrixT& b);
-
-  void addP2P(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + b*p
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1*this + p2*b
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = this - b
-   * @endcode
-   */
-  void sub(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this - b*p
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * b = max(0, this)
-   * @endcode
-   */
-  void relu(BaseMatrixT& b);
-  void reluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = log(1.0 + exp(this))
-   * @endcode
-   */
-  void softrelu(BaseMatrixT& b);
-  void softreluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = min(max(this, p1), p2)
-   * @endcode
-   */
-  void brelu(BaseMatrixT& b);
-  void breluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this * this
-   * @endcode
-   */
-  void square2(BaseMatrixT& b);
-  void squareDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = tanh(this)
-   * @endcode
-   */
-  void tanh(BaseMatrixT& b);
-  void tanhDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = p1 * tanh(p2 * this)
-   * @endcode
-   */
-  void scaledTanh(BaseMatrixT& b, T p1, T p2);
-  void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * b = 1.0f / this
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b);
-  void reciprocalDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this > 0.0f ? this : -this
-   * @endcode
-   */
-  void abs2(BaseMatrixT& b);
-  void absDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = 1.0f / (1.0f + exp(-this))
-   * @endcode
-   */
-  void sigmoid(BaseMatrixT& b);
-  void sigmoidDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = a
-   * @endcode
-   */
-  void expDerivative(BaseMatrixT& b);
-
-  void sign2(BaseMatrixT& b);
-
-  void exp2(BaseMatrixT& b);
-  void pow2(BaseMatrixT& b, T p);
-  void log2(BaseMatrixT& b);
-  void sqrt2(BaseMatrixT& b);
-  void addScalar(BaseMatrixT& b, T p);
-  void subScalar(BaseMatrixT& b, T p);
-  void mulScalar(BaseMatrixT& b, T p);
-  void divScalar(BaseMatrixT& b, T p);
-  void scalarDiv(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = 1.0f / sqrt(b)
-   * @endcode
-   */
-  void invSqrt(BaseMatrixT& b);
-
-  /// this = (b == value)
-  void isEqualTo(BaseMatrixT& b, T value);
-
-  /**
-   * @brief   ternary operator.
-   */
-  void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b + c
-   * @endcode
-   */
-  void add(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 + c*p2
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-  /**
-   * @code
-   * this = b - c
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 - c*p2
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-
-  /**
-   * @code
-   * this = this + b + c
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = this*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this = a*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   *   c = p2 * c - p1 *  (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  //  grad
-                 BaseMatrixT& c,  //  mom
-                 T p1,            //  learningRate,
-                 T p2,            //  momentum,
-                 T p3);           //  decayRate
-
-  /**
-   * @code
-   *   c = p2 * c - p1 * d * (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  // grad,
-                 BaseMatrixT& c,  // mom,
-                 BaseMatrixT& d,  // lr,
-                 T p1,            // learningRate,
-                 T p2,            // momentum,
-                 T p3);           // decayRate
-
-  /// apply L1/L2 to *this*
-  virtual void applyL1(T learningRate, T decayRate);
-  void applyL1(BaseMatrixT& lr, T learningRate, T decayRate);
-  void applyL2(T learningRate, T decayRate);
-  void applyL2(BaseMatrixT& lr, T learningRate, T decayRate);
-
-  /**
-   * @code
-   * this *= b
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b / c
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = (b + p1) / (c + p2)
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = log(1 + exp(b - c)) - d * (b - c)
-   * @endcode
-   */
-  void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-  void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = log(1 + exp(b)) - c * b
-   * @endcode
-   */
-  void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this += exp(b)/(1+exp(b)) - c
-   * @endcode
-   */
-  void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b > c ? 1.0 : 0.0
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = ((b>c && d>0.5) || (b<c && d<0.5)) ? 1 : 0)
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = b>c ? b : c
-   * @endcode
-   */
-  void max2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this[destCol] += (b>p1 == c>p1) ? 0 : 1)
-   * @endcode
-   */
-  void binaryClassificationError(size_t destCol,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 T p);
-  void binaryClassificationError2(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c,
-                                  T p);
-
-  /**
-   * @code
-   * this = this * b * b
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this * this * b
-   * @endcode
-   */
-  void dotSquareMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c * c
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b * b * c * c
-   * @endcode
-   */
-  void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = this * (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this=  this * (p1*b + p2*c)
-   * @endcode
-   */
-  void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this += sqr(p1*b + p2*c + p3*d)
-   * @endcode
-   */
-  void addSquareSum(
-      BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this += p * sqr(b)
-   * @endcode
-   */
-  void addSquare(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b)
-   * @endcode
-   */
-  void decayAddSquare(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b * c)
-   * @endcode
-   */
-  void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2)
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2 * c + p3)
-   * @endcode
-   */
-  void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * b = this; this = 0
-   * @endcode
-   */
-  void copyAndClear(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this_row[destCol] += dotprod(b_row, c_row)
-   * @endcode
-   */
-  void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * this is vector (one row matrix)
-   *
-   * @code
-   *   for each row i, do:
-   *      this_row += dotmul(b_row_i, c_row_i)
-   * @endcode
-   */
-  void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * c is vector (one row matrix)
-   *
-   * @code
-   * for each row i, do:
-   *    this_row_i += dotmul(b_row_i, c_row)
-   * @endcode
-   */
-  void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * b * c
-   * @endcode
-   */
-  void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this_row = b_row * c_row[cCol]
-   * @endcode
-   */
-  void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col = b_col * c_col[cRow]
-   * @endcode
-   */
-  void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col += b_col * c_col[cRow]
-   * @endcode
-   */
-  void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_row += b_row * c_row[cCol]
-   * @endcode
-   */
-  void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /// calculate the sum of each row of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
-  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// calculate the maximum value of each row of the matrix b.
-  void maxRows(BaseMatrixT& b);
-  /// calculate the minimum value of each row of the matrix b.
-  void minRows(BaseMatrixT& b);
-
-  /// calculate the maximum value of each column of the matrix b.
-  void maxCols(BaseMatrixT& b);
-  /// calculate the minimum value of each column of the matrix b.
-  void minCols(BaseMatrixT& b);
-
-  /// calculate the sum of each column of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
-  void sumOfSquaredDiffs(BaseMatrixT& b,
-                         BaseMatrixT& c,
-                         T scaleSum,
-                         T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest);
-
-  /**
-   * @code
-   * this_row = b_row + p * ones * c_row[cCol]
-   * @endcode
-   */
-  void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p);
-  /**
-   * @code
-   * this_row = pow(b_row, c_row[cCol])
-   * @endcode
-   */
-  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  virtual bool isSparse() const { return false; }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
-  template <typename ExpressionType>
-  void operator+=(const ExpressionType& expr) {
-    (*this) = (*this) + expr;
-  }
-  template <typename ExpressionType>
-  void operator-=(const ExpressionType& expr) {
-    (*this) = (*this) - expr;
-  }
-  template <typename ExpressionType>
-  void operator*=(const ExpressionType& expr) {
-    (*this) = (*this) * expr;
-  }
-  template <typename ExpressionType>
-  void operator/=(const ExpressionType& expr) {
-    (*this) = (*this) / expr;
-  }
-};
-
-typedef BaseMatrixT<real> BaseMatrix;
-typedef BaseMatrixT<int> IBaseMatrix;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/CMakeLists.txt b/paddle/legacy/math/CMakeLists.txt
deleted file mode 100644
index 9992ec71f..000000000
--- a/paddle/legacy/math/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-# common package contains:
-#   * the utilities:
-#       * Thread Libs
-#       * Memory Manage libs
-#       * CommandLine Parser
-#       * Logging
-#       * Timer/Stats
-#   * the math libraries:
-#       * Matrix/Vector
-#   * the parameter optimizers.
-#   * the parameter updater functions.
-#
-# TODO(yuyang18): separate libs.
-#
-file(GLOB MATH_HEADERS . *.h)
-file(GLOB MATH_SOURCES . *.cpp)
-
-if(NOT WITH_MKLDNN)
-    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
-    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
-    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
-    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
-    message(STATUS "Skip compiling with MKLDNNMatrix")
-else()
-    message(STATUS "Compile with MKLDNNMatrix")
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove sparse
-    list(REMOVE_ITEM MATH_HEADERS
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
-    list(REMOVE_ITEM MATH_SOURCES
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
-endif()
-set(MATH_SOURCES
-    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu"
-    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu"
-    ${MATH_SOURCES})
-if(NOT WITH_GPU)
-    # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu")
-    add_library(paddle_math STATIC
-        ${MATH_SOURCES})
-else()
-    cuda_add_library(paddle_math ${MATH_SOURCES})
-endif()
-
-
-add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/math/CpuSparseMatrix.cpp b/paddle/legacy/math/CpuSparseMatrix.cpp
deleted file mode 100644
index 20c65a3a1..000000000
--- a/paddle/legacy/math/CpuSparseMatrix.cpp
+++ /dev/null
@@ -1,787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CpuSparseMatrix.h"
-#include "SparseMatrix.h"
-#include "float.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH;
-
-CpuSparseMatrix::CpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(dataHandle, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(real* data,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  cols_ = cols;
-  rows_ = rows;
-  value_ = data;
-  height_ = height;
-  width_ = width;
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-}
-
-void CpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  CHECK_LE(newNnz, newHeight * newWidth);
-  size_t newSize = 0;
-  if (format == SPARSE_CSR) {
-    newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  } else {
-    newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = format;
-  sparseResize();
-}
-void CpuSparseMatrix::sparseResize() {
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  }
-}
-
-void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight,
-         newWidth,
-         newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
-         valueType_,
-         format_);
-}
-
-MatrixPtr CpuSparseMatrix::getTranspose() {
-  if (!memoryHandle_ && !value_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        height_, width_, elementCnt_, valueType_, format_, true));
-    return dest;
-  } else if (memoryHandle_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true));
-    return dest;
-  } else if (value_) {
-    MatrixPtr dest(new CpuSparseMatrix(value_,
-                                       rows_,
-                                       cols_,
-                                       height_,
-                                       width_,
-                                       elementCnt_,
-                                       valueType_,
-                                       format_,
-                                       true));
-    return dest;
-  } else {
-    return NULL;
-  }
-}
-
-SparseValueType CpuSparseMatrix::getValueType() { return valueType_; }
-
-void CpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::add3(CpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* A = getValue();
-  real* B = b->getData();
-  int* cols = getCols();
-  for (size_t i = 0; i < height_; i++) {
-    size_t start = getRowStartIdx(i);
-    size_t end = getRowStartIdx(i + 1);
-    for (size_t j = start; j < end; j++) {
-      A[j] = B[i * width_ + cols[j]];
-    }
-  }
-}
-
-void CpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<CpuMatrix*>(b.get())) {
-    add3(dynamic_cast<CpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getData();
-  int* cols = getCols();
-  size_t nnz = getElementCnt();
-  for (size_t i = 0; i < nnz; i++) {
-    A[i] += scale * B[cols[i]];
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void CpuSparseMatrix::print(std::ostream& os) const {
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-  printBuf(os, rows_, rowSize, "row");
-  printBuf(os, cols_, colSize, "col");
-  if (valueType_ == FLOAT_VALUE) {
-    printBuf(os, value_, elementCnt_, "value");
-  }
-  return;
-}
-
-void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  if (format_ == SPARSE_CSC) {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-    return;
-  }
-
-  const int* col = getRowCols(idx);
-  size_t num = getColNum(idx);
-  if (num > 0) {
-    if (valueType_ == FLOAT_VALUE) {
-      const real* data = getRowValues(idx);
-      os << col[0] << ":" << data[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i] << ":" << data[i];
-      }
-    } else {
-      os << col[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i];
-      }
-    }
-  }
-  os << ";";
-}
-
-void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK_EQ(height_, b.getHeight());
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getValue();
-  if (b.getValueType() == FLOAT_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = B[j] * c.getElement(i, cCol);
-      }
-    }
-  } else if (b.getValueType() == NO_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = c.getElement(i, cCol);
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::randomizeUniform() {
-  CHECK_LE(elementCnt_, height_ * width_);
-  if (valueType_ == FLOAT_VALUE) {
-    real* data = getValue();
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      *data++ = rand() / static_cast<real>(RAND_MAX);  // NOLINT
-    }
-  }
-  if (format_ == SPARSE_CSR) {
-    sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false);
-  } else {
-    sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false);
-  }
-}
-
-void CpuSparseMatrix::copyFrom(std::vector<int>& rows,
-                               std::vector<int>& cols,
-                               std::vector<real>& values) {
-  size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size();
-  resize(height_, width_, size, valueType_, format_);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(&value_[0], &values[0], sizeof(real) * values.size());
-  }
-  memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size());
-  memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size());
-}
-
-// Copy from a CpuMatrix, only supported in sparse_float_value_t
-// SparseMatrix.
-void CpuSparseMatrix::copyFrom(const CpuMatrix& src) {
-  CHECK_EQ(getHeight(), src.getHeight());
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK(!src.trans_ && !trans_);
-  if (format_ == SPARSE_CSR) {
-    std::vector<int> rows(getHeight() + 1);
-    std::vector<int> cols;
-    std::vector<real> values;
-    rows[0] = 0;
-    for (size_t r = 0; r < getHeight(); ++r) {
-      for (size_t c = 0; c < getWidth(); ++c) {
-        real v = src.getElement(r, c);
-        if (fabs(v) > FLT_EPSILON) {
-          cols.push_back(c);
-          values.push_back(v);
-        }
-      }
-      rows[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  } else {
-    std::vector<int> cols(getWidth() + 1);
-    std::vector<int> rows;
-    std::vector<real> values;
-    cols[0] = 0;
-    for (size_t r = 0; r < getWidth(); ++r) {
-      for (size_t c = 0; c < getHeight(); ++c) {
-        real v = src.getElement(c, r);
-        if (fabs(v) > FLT_EPSILON) {
-          rows.push_back(c);
-          values.push_back(v);
-        }
-      }
-      cols[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) {
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-  CHECK(width && height);
-  if (!useGpu) {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, 0, valueType_, format_);
-  } else {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, elementCnt_, valueType_, format_);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
-  CHECK_LE(startRow + numRows, height_);
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (valueType_ == NO_VALUE) {
-    return std::make_shared<CpuSparseMatrix>(
-        nullptr,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        value_,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  }
-}
-
-/* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK(!memAlloc);
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
-  if (format_ == SPARSE_CSR) {
-    /*statistic element number in each col*/
-    int* colCounters = mat->getRows() + 1;
-    memset(colCounters, 0, sizeof(int) * width_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int col = cols_[i];
-      colCounters[col]++;
-    }
-    /*fill mat rows */
-    mat->getRows()[0] = 0;
-    for (size_t i = 1; i < width_ + 1; i++) {
-      mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i];
-    }
-    /*fill mat values and cols*/
-    std::vector<int> colNumVec(width_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          mat->getValue()[index] = value_[j];
-          colNumVec[colIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          colNumVec[colIdx]++;
-        }
-      }
-    }
-  } else {
-    /*statistic element number in each row*/
-    int* rowCounters = mat->getCols() + 1;
-    memset(rowCounters, 0, sizeof(int) * height_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int row = rows_[i];
-      rowCounters[row]++;
-    }
-
-    /*fill mat cols */
-    mat->getCols()[0] = 0;
-    for (size_t i = 1; i < height_ + 1; i++) {
-      mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i];
-    }
-    /*fill mat values and rows*/
-    std::vector<int> rowNumVec(height_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          mat->getValue()[index] = value_[j];
-          rowNumVec[rowIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          rowNumVec[rowIdx]++;
-        }
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  if (format_ == SPARSE_CSR) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    if (0 == row) {
-      rows_[row] = 0;
-    }
-    rows_[row + 1] = rows_[row] + colNum;
-    for (size_t i = 0; i < colNum; ++i) {
-      cols_[rows_[row] + i] = cols[i];
-    }
-    if (valueType_ == NO_VALUE) {
-      CHECK(!values);
-    } else {
-      for (size_t i = 0; i < colNum; ++i) {
-        value_[rows_[row] + i] = values[i];
-      }
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const {
-  if (format_ == SPARSE_CSR) {
-    auto nnz = getElementCnt();
-    IVector::resizeOrCreate(outVec, nnz, false);
-    auto out = outVec->getData();
-    int* rows = getRows();
-    for (size_t i = 0; i < height_; i++) {
-      for (int j = rows[i]; j < rows[i + 1]; j++) {
-        out[j] = i;
-      }
-    }
-  } else {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-  }
-}
-
-ThreadLocal<std::vector<CpuSparseMatrixPtr>> CpuSparseMatrix::cpuLocalMats_;
-
-CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height,
-                                                       size_t width) {
-  std::vector<CpuSparseMatrixPtr>* localMats = cpuLocalMats_.get();
-  auto it = localMats->begin();
-  while (it != localMats->end()) {
-    if (it->unique()) {
-      (*it)->resize(height, width, elementCnt_, valueType_, format_);
-      return *it;
-    }
-  }
-  localMats->emplace_back(std::make_shared<CpuSparseMatrix>(
-      height, width, elementCnt_, valueType_, format_, false));
-  return localMats->back();
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const GpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc, stream);
-  } else if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src) {
-  if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(size_t(elementCnt_), src.getElementCnt());
-  size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_;
-  if (format_ == SPARSE_CSC)
-    hl_memcpy_from_csc_matrix(value_,
-                              valSize,
-                              rows_,
-                              elementCnt_,
-                              cols_,
-                              width_ + 1,
-                              src.sMatrix_.get(),
-                              stream);
-  else
-    hl_memcpy_from_csr_matrix(value_,
-                              valSize,
-                              rows_,
-                              height_ + 1,
-                              cols_,
-                              elementCnt_,
-                              src.sMatrix_.get(),
-                              stream);
-}
-
-void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0];
-  if (format_ == SPARSE_CSR) {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      totalColNum += src.getColNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    rows_[0] = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      rows_[i + 1] = rows_[i] + src.getColNum(i);
-    }
-    memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int));
-  } else {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      totalColNum += src.getRowNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    cols_[0] = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      cols_[i + 1] = cols_[i] + src.getRowNum(i);
-    }
-    memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int));
-  }
-
-  // if have different value type, only copy rows and cols
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real));
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) {
-  size_t totalColNum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    totalColNum += indices[id + 1] - indices[id];
-  }
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    T* row = data + indices[id];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) {
-  CHECK(format_ == SPARSE_CSR);
-  size_t totalColNum = indices[height_] - indices[0];
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    T* row = data + indices[i];
-    size_t colNum = indices[i + 1] - indices[i];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_LE(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  CHECK_EQ(valueType_, src.getValueType());
-  if (format_ == SPARSE_CSR) {
-    int* srcCols = src.getCols();
-    size_t numLessWidth =
-        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
-          return n < this->width_;
-        });
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    rows_[0] = 0;
-    size_t index = 0;
-    for (size_t r = 0; r < height_; ++r) {
-      for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-        if (srcCols[i] < static_cast<int>(width_)) {
-          cols_[index] = srcCols[i];
-          if (valueType_ == FLOAT_VALUE) {
-            value_[index] = src.getValue()[i];
-          }
-          ++index;
-        }
-      }
-      rows_[r + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  } else {
-    size_t numLessWidth = src.getCols()[width_] - src.getCols()[0];
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    cols_[0] = 0;
-    size_t index = 0;
-    // note: c < width_, not src.getWidth();
-    for (size_t c = 0; c < width_; ++c) {
-      for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) {
-        rows_[index] = src.getRows()[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-      cols_[c + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  }
-}
-
-void CpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  memset(value_, 0, elementCnt_ * sizeof(real));
-}
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_float_value_t* data);
-
-void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  maxVal.zeroMem();
-  int* outids = maxIds.getData();
-  real* outvalues = maxVal.getData();
-
-  typedef std::pair<real, size_t> valuepair;
-  std::vector<valuepair> vec;
-  for (size_t i = 0; i < numSamples; i++) {
-    vec.clear();
-
-    auto num = getColNum(i);
-    auto ids = getRowCols(i);
-    auto values = getRowValues(i);
-    for (size_t j = 0; j < num; j++) {
-      vec.push_back(std::make_pair(values[j], ids[j]));
-    }
-
-    size_t outsize = std::min(num, beam);
-    std::partial_sort(vec.begin(),
-                      vec.begin() + outsize,
-                      vec.end(),
-                      [](const valuepair& a, const valuepair& b) {
-                        return a.first > b.first;
-                      });
-    for (size_t j = 0; j < outsize; j++) {
-      outids[i * beam + j] = vec[j].second;
-      outvalues[i * beam + j] = vec[j].first;
-    }
-    if (outsize < beam) {
-      // if the number of values to sort are less than the output size,
-      // use -1 to indicate the end of valid sorted values.
-      outids[i * beam + outsize] = -1;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/CpuSparseMatrix.h b/paddle/legacy/math/CpuSparseMatrix.h
deleted file mode 100644
index 172792c29..000000000
--- a/paddle/legacy/math/CpuSparseMatrix.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
- public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false);
-
-  CpuSparseMatrix(CpuMemHandlePtr memHandle,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  ~CpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-  void resize(size_t newHeight, size_t newWidth);
-
-  MatrixPtr getTranspose();
-
-  SparseValueType getValueType();
-
-  real* getRowValues(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return value_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  int* getRowCols(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return cols_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  /// fill row indices of each value in CSR matrix
-  void fillRowIndices(IVectorPtr& outVec) const;
-
-  size_t getColNum(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i + 1] - rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  real* getColumn(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return value_ + cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getColStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowNum(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i + 1] - cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  virtual real getSum() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return elementCnt_;
-    }
-    double sum = 0;
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      sum += value_[i];
-    }
-    return sum;
-  }
-
-  virtual void square2() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return;
-    }
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      value_[i] = value_[i] * value_[i];
-    }
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual min value should compare with 0.0.
-   */
-  virtual real getMin() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real min = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      min = value_[i] < min ? value_[i] : min;
-    }
-    return min;
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual max value should compare with 0.0.
-   */
-  virtual real getMax() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real max = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      max = value_[i] > max ? value_[i] : max;
-    }
-    return max;
-  }
-
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  int* getRows() const { return rows_; }
-  int* getCols() const { return cols_; }
-  real* getValue() const { return value_; }
-  SparseFormat getFormat() const { return format_; }
-  SparseValueType getValueType() const { return valueType_; }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   *  Named add3 just because add/add2 has been used in BaseMatrix.cu
-   *  and they are not virtual function.
-   *
-   *  Only add value of same (row, col) index in dense matrix
-   *  and do not use others values whoes postions are not in sparse matirx.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(CpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  void print(std::ostream& os) const;
-
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-
-  /**
-   * @brief this_row = b_row * c_row[cCol]
-   *
-   * @param[in]  cCol   the column of matrix c used to scale each row of b
-   * @param[in]  b      CpuSparseMatrix
-   * @param[in]  c      Matrix
-   */
-  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
-
-  void randomizeUniform();
-
-  void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream = HPPL_STREAM_DEFAULT);
-
-  void copyFrom(const Matrix& src);
-
-  /**
-   * Get a temporary matrix. This is threadsafe. It should be only used
-   * temporarily, i.e. do not store it or use it as return value.
-   *
-   * @note  Do NOT use large amount of tmp matrix.
-   */
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width);
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows);
-
-  void copyFrom(std::vector<int>& rows,
-                std::vector<int>& cols,
-                std::vector<real>& values);
-
-  void copyFrom(const CpuMatrix& src);
-
-  void copyFrom(const CpuSparseMatrix& src);
-
-  // trim the large size
-  void trimFrom(const CpuSparseMatrix& src);
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data);
-
-  template <class T>
-  void copyFrom(int64_t* indices, T* data);
-
-  void copyFrom(const real* data, size_t len) {
-    LOG(FATAL) << "not supported!";
-  }
-
- private:
-  MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
-
- protected:
-  void sparseResize();
-  /*for csr , record row start position, for csc, record row index for every no
-   * zero value*/
-  int* rows_;
-  /*for csc , record col start position, for csr, record col index for every no
-   * zero value*/
-  int* cols_;
-  real* value_;               /*nonzero value*/
-  SparseFormat format_;       /* matrix format */
-  SparseValueType valueType_; /*with value or not  */
-  static const size_t DEFAULT_AVG_WIDTH = 20;
-
-  static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
-
-  // BaseMatrixT interface
- public:
-  bool isSparse() const { return true; }
-
- private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-}  // namespace paddle
-
-#else
-
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
- public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  real* getValue() const { return nullptr; }
-  size_t getColStartIdx(size_t i) const { return 0; }
-  size_t getRowStartIdx(size_t i) const { return 0; }
-  size_t getColNum(size_t i) const { return 0; }
-  int* getRowCols(size_t i) const { return nullptr; }
-
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
-    return nullptr;
-  }
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/ExecViaCpu.h b/paddle/legacy/math/ExecViaCpu.h
deleted file mode 100644
index ec2337545..000000000
--- a/paddle/legacy/math/ExecViaCpu.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through
- cpu functions. It can automatically make a temporary CPU copy for the
- gpu matrix/vector, and copy back after executing the CPU function.
-
- Examples:
- 1. For a function, functor or lambda:
-   r = execViaCpu(&f, mat, vec)
-
- 2. For member function of CpuMatirx, execViaCpu2 should be used:
-   execViaCpu2(&CpuMatrix::selectElements, *this, table, ids)
-*/
-
-#pragma once
-
-namespace paddle {
-
-template <typename Arg>
-class CopyToCpu {
- public:
-  explicit CopyToCpu(Arg& arg) : arg_(arg) {}
-  Arg& copiedArg() const { return arg_; }
-
- private:
-  Arg& arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
- public:
-  explicit CopyToCpu(Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<const Matrix> {
- public:
-  explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  const Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<IVector> {
- public:
-  explicit CopyToCpu(IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  IVector& arg_;
-  IVectorPtr copied_;
-};
-
-template <>
-class CopyToCpu<const IVector> {
- public:
-  explicit CopyToCpu(const IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
- private:
-  const IVector& arg_;
-  IVectorPtr copied_;
-};
-
-namespace detail {
-
-template <bool isFunction, bool isFunctionPointer, bool isClass, typename F>
-class GpuFuncWrapperImp;
-
-template <typename F, typename R, typename... Args>
-class GpuFuncWrapperBase {
- public:
-  typedef R ResultType;
-  R operator()(F&& f, Args... args) {
-    return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
-                 .copiedArg()...);
-  }
-};
-
-// function
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<true, false, false, R(Args...)>
-    : public GpuFuncWrapperBase<R(Args...), R, Args...> {};
-
-// function pointer
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<false, true, false, R (*)(Args...)>
-    : public GpuFuncWrapperBase<R (*)(Args...), R, Args...> {};
-
-template <typename F, typename Op>
-class GpuFuncWrapperImp2;
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...) const>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...)>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-// functor or lambda
-template <typename F>
-class GpuFuncWrapperImp<false, false, true, F>
-    : public GpuFuncWrapperImp2<F, decltype(&F::operator())> {};
-
-template <typename F>
-class GpuFuncWrapper2
-    : public GpuFuncWrapperImp<
-          std::is_function<F>::value,
-          std::is_pointer<F>::value &&
-              std::is_function<typename std::remove_pointer<F>::type>::value,
-          std::is_class<F>::value,
-          F> {};
-
-template <typename F>
-class GpuFuncWrapper
-    : public GpuFuncWrapper2<typename std::remove_reference<F>::type> {};
-
-}  // namespace detail
-
-template <typename F, typename... Args>
-typename detail::GpuFuncWrapper<F>::ResultType execViaCpu(F&& f,
-                                                          Args&&... args) {
-  return detail::GpuFuncWrapper<F>()(std::move(f), args...);
-}
-
-// The second version is for F as member function of CpuMatrix
-template <typename R, typename... FArgs, typename... Args>
-R execViaCpu2(R (CpuMatrix::*f)(FArgs...), Args&&... args) {
-  auto lambda = [](R (CpuMatrix::*f)(FArgs...), Matrix& ths, FArgs... args) {
-    return (((CpuMatrix&)ths).*f)(args...);
-  };
-  return execViaCpu(lambda, f, args...);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MKLDNNMatrix.cpp b/paddle/legacy/math/MKLDNNMatrix.cpp
deleted file mode 100644
index 52036c5f8..000000000
--- a/paddle/legacy/math/MKLDNNMatrix.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNMatrix.h"
-
-using namespace mkldnn;  // NOLINT
-
-namespace paddle {
-
-MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
-  memory::desc md = pd.desc();
-  size_t ndims = md.data.ndims;
-  int* dims = md.data.dims;
-  CHECK(ndims > 0) << "Input dims should not be empty";
-  size_t cnts = 1;
-  for (size_t i = 0; i < ndims; ++i) {
-    cnts *= dims[i];
-  }
-
-  if (m == nullptr) {
-    size_t height = dims[0];
-    size_t width = cnts / dims[0];
-    m = Matrix::create(height, width, false, false);
-  }
-  CHECK(m) << " Matrix should not be empty";
-
-  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
-  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
-  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
-}
-
-MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
-                                     memory::format fmt,
-                                     engine& eg,
-                                     MatrixPtr m,
-                                     mkldnn::memory::data_type dtype) {
-  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
-}
-
-std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
-                                                     const MKLDNNMatrixPtr& dst,
-                                                     bool checkData) {
-  if (src == dst || src->getPrimitiveDesc() == dst->getPrimitiveDesc()) {
-    return nullptr;
-  }
-
-  if (checkData && (src->getData() == dst->getData())) {
-    LOG(FATAL) << "can not create reorder with inplace data";
-    return nullptr;
-  }
-
-  memory::dims srcDims = src->getDims();
-  memory::dims dstDims = dst->getDims();
-  CHECK_EQ(srcDims.size(), dstDims.size());
-  for (size_t i = 0; i < srcDims.size(); ++i) {
-    CHECK_EQ(srcDims[i], dstDims[i]);
-  }
-  return std::make_shared<reorder>(*src, *dst);
-}
-
-void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
-                                   memory::format srcFmt,
-                                   memory::dims targetDim) {
-  memory::format dstFmt = getFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
-}
-
-void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
-                                 memory::format dstFmt,
-                                 memory::dims targetDim) {
-  memory::format srcFmt = getFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
-}
-
-void MKLDNNMatrix::reorderOnce(void* srcData,
-                               void* dstData,
-                               memory::format srcFmt,
-                               memory::format dstFmt,
-                               memory::dims dm) {
-  CHECK(srcData);
-  CHECK(dstData);
-  MatrixPtr tmpSrc;
-  if (dstData == srcData) {
-    // inplace data
-    size_t sz = 1;
-    for (size_t i = 0; i < dm.size(); ++i) {
-      sz *= dm[i];
-    }
-    tmpSrc = Matrix::create(sz, 1, false, false);
-    tmpSrc->copyFrom((real*)srcData, sz);
-    srcData = tmpSrc->getData();
-  }
-
-  auto dtype = this->getDtype();
-  auto srcMD = memory::desc(dm, dtype, srcFmt);
-  auto dstMD = memory::desc(dm, dtype, dstFmt);
-
-  auto eg = this->getEngine();
-  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
-  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
-
-  auto r = reorder(src, dst);
-  stream(stream::kind::eager).submit({r}).wait();
-}
-
-void MKLDNNMatrix::downSpatial() {
-  int fmt = getFormat();
-  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
-    // only support nchw and oihw yet, later can support more like nhwc, ihwo
-    return;
-  }
-
-  // TODO(TJ): change H(height) and W(width) if support nhwc or more
-  const int H = 2, W = 3;
-  memory::dims srcDims = getDims();
-  if (srcDims[H] != 1 || srcDims[W] != 1) {
-    // can not down spatial
-    return;
-  }
-
-  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
-  memory::format dstFmt;
-  switch (fmt) {
-    case memory::format::nchw:
-      dstFmt = memory::format::nc;
-      break;
-    case memory::format::oihw:
-      dstFmt = memory::format::oi;
-      break;
-    default:
-      LOG(FATAL) << "unsupported format";
-  }
-  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
-  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  resetMKLDNNMemory(pd, data_);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MKLDNNMatrix.h b/paddle/legacy/math/MKLDNNMatrix.h
deleted file mode 100644
index 5a0e5f859..000000000
--- a/paddle/legacy/math/MKLDNNMatrix.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Matrix.h"
-#include "mkldnn.hpp"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-class MKLDNNMatrix;
-typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
-
-#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                        \
-  CHECK(MAT) << " can not be empty.";                                \
-  CHECK(MAT->getPrimitiveDesc() == PD)                               \
-      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
-      << "" __VA_ARGS__;
-
-/**
- * @brief MKLDNN Matrix.
- *
- */
-class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
- public:
-  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
-        mkldnn::memory(pd, m->getData()),
-        m_(m) {}
-
-  ~MKLDNNMatrix() {}
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
-   */
-  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
-                                MatrixPtr m = nullptr);
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory details info
-   */
-  static MKLDNNMatrixPtr create(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::engine& eg,
-      MatrixPtr m = nullptr,
-      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
-
-  /**
-   * Create primitive descriptor.
-   * default with f32 dtype
-   */
-  static mkldnn::memory::primitive_desc createPrimitiveDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt,
-      const mkldnn::engine& eg,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
-  }
-
-  /**
-   * Create Memory descriptor.
-   * default with any format and f32 dtype
-   */
-  static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::desc(dims, dtype, fmt);
-  }
-
-  /**
-   * Create reorder primitive.
-   * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
-   * checkData: whether to check the data handle of src and dst.
-   *            if true, it will check the data and do not allow them equal;
-   *            otherwise, it will not check them, then the reorder created
-   *            may have inplace buffer.
-   *            Do not set false, if you can not guarantee the inplace logical
-   *            would work with your reorder.
-   */
-  static std::shared_ptr<mkldnn::reorder> createReorder(
-      const MKLDNNMatrixPtr& src,
-      const MKLDNNMatrixPtr& dst,
-      bool checkData = true);
-
-  void copyFrom(const Matrix& src) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    m_->copyFrom(src);
-  }
-
-  void copyTo(Matrix& dst) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    dst.copyFrom(*m_);
-  }
-
- public:
-  /**
-   * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change this original dim or format info
-   */
-  void reorderDataFrom(const MKLDNNMatrixPtr& m,
-                       memory::format srcFmt,
-                       memory::dims targetDim);
-
-  /**
-   * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change the dst dim or format info
-   */
-  void reorderDataTo(const MKLDNNMatrixPtr& m,
-                     memory::format dstFmt,
-                     memory::dims targetDim);
-
-  /**
-   * Dimensionality reduction.
-   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
-   */
-  void downSpatial();
-
-  /**
-   * set the memory data handle.
-   * Caution: This will not check the buffer size of the data,
-   *          it should be coverd by user.
-   */
-  void setData(real* data) {
-    set_data_handle(data);
-    CpuMatrix::setData(data);
-    m_.reset();
-  }
-
-  /**
-   * override the CpuMatrix::resize
-   */
-  void resize(size_t newHeight, size_t newWidth) override {
-    m_->resize(newHeight, newWidth);
-    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
-      return;
-    }
-    CpuMatrix::setData(data_);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-    auto pd = mkldnn::memory::primitive_desc(
-        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
-                             getDtype(),
-                             mkldnn::memory::format::nc),
-        getEngine());
-    resetMKLDNNMemory(pd, data_);
-  }
-
-  /**
-   * override Matrix::getData
-   * check data before return
-   */
-  real* getData() override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  const real* getData() const override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  /**
-   * Get primitive descriptor.
-   */
-  mkldnn::memory::primitive_desc getPrimitiveDesc() {
-    return this->get_primitive_desc();
-  }
-
-  /**
-   * Get memory descriptor.
-   */
-  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
-
-  /**
-   * Get dimensions.
-   */
-  mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMemoryDesc();
-    const int* src = md.data.dims;
-    int ndims = md.data.ndims;
-    mkldnn::memory::dims dst;
-    dst.resize(ndims);
-    for (int i = 0; i < ndims; ++i) {
-      dst[i] = src[i];
-    }
-    return dst;
-  }
-
-  /**
-   * Get format.
-   */
-  mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMemoryDesc().data.format);
-  }
-
-  /**
-   * Get memory data type.
-   */
-  mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
-  }
-
-  /**
-   * Get engine.
-   */
-  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
-
- protected:
-  /**
-   * Do reorder once.
-   * Can support inplace.
-   */
-  void reorderOnce(void* srcData,
-                   void* dstData,
-                   memory::format srcFmt,
-                   memory::format dstFmt,
-                   memory::dims dm);
-  /**
-   * reset this MKLDNN Memory from primitve desc
-   */
-  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
-    mkldnn_primitive_t result;
-    mkldnn::error::wrap_c_api(
-        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-        "could not create a memory primitive");
-    reset(result);
-    set_data_handle(data);
-  }
-
- private:
-  // save the CpuMatrixPtr in case the buffer released outside
-  CpuMatrixPtr m_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.cpp b/paddle/legacy/math/MathFunctions.cpp
deleted file mode 100644
index bbf34a32f..000000000
--- a/paddle/legacy/math/MathFunctions.cpp
+++ /dev/null
@@ -1,348 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/MathFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "paddle/legacy/utils/DynamicLoader.h"
-
-namespace dynload {
-
-std::once_flag lapack_dso_flag;
-void* lapack_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load lapack routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-
-// The argument for stringizing operator is not macro-expanded first.
-// We have to use two levels of macro to do the expansion.
-// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
-#define STR(x) #x
-
-// clang-format off
-#ifndef LAPACK_FOUND
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
-      void* p_##__name = dlsym(lapack_dso_handle, STR(__name));                \
-      CHECK(p_##__name) << "Cannot find symbol " << STR(__name)                \
-                        << " in liblapack.so";                                 \
-      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      return __name(args...);                                                  \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#define  PADDLE_SGETRF  LAPACKE_sgetrf
-#define  PADDLE_DGETRF  LAPACKE_dgetrf
-#define  PADDLE_SGETRI  LAPACKE_sgetri
-#define  PADDLE_DGETRI  LAPACKE_dgetri
-
-#define LAPACK_ROUTINE_EACH(__macro)       \
-  __macro(PADDLE_SGETRF)                   \
-  __macro(PADDLE_DGETRF)                   \
-  __macro(PADDLE_SGETRI)                   \
-  __macro(PADDLE_DGETRI)
-// clang-format on
-
-LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
-
-}  // namespace dynload
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA,
-                 const CBLAS_TRANSPOSE transB,
-                 const int M,
-                 const int N,
-                 const int K,
-                 const float alpha,
-                 const float* A,
-                 const int lda,
-                 const float* B,
-                 const int ldb,
-                 const float beta,
-                 float* C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE transB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const double alpha,
-                  const double* A,
-                  const int lda,
-                  const double* B,
-                  const int ldb,
-                  const double beta,
-                  double* C,
-                  const int ldc) {
-  cblas_dgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-#endif
-
-template <>
-int getrf<float>(const CBLAS_ORDER order,
-                 const int M,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 int* ipiv) {
-  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getrf<double>(const CBLAS_ORDER order,
-                  const int M,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  int* ipiv) {
-  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getri<float>(const CBLAS_ORDER order,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 const int* ipiv) {
-  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
-}
-
-template <>
-int getri<double>(const CBLAS_ORDER order,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  const int* ipiv) {
-  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
-}
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void axpy<float>(const int n, const float alpha, const float* x, float* y) {
-  cblas_saxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-void axpy<double>(const int n, const double alpha, const double* x, double* y) {
-  cblas_daxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-float dotProduct<float>(const int n, const float* x, const float* y) {
-  return cblas_sdot(n, x, 1, y, 1);
-}
-
-template <>
-double dotProduct<double>(const int n, const double* x, const double* y) {
-  return cblas_ddot(n, x, 1, y, 1);
-}
-#endif
-
-#if defined(PADDLE_WITH_MKLML)
-
-template <>
-void vExp<float>(const int n, const float* a, float* r) {
-  vsExp(n, a, r);
-}
-
-template <>
-void vExp<double>(const int n, const double* a, double* r) {
-  vdExp(n, a, r);
-}
-
-template <>
-void vPow<float>(const int n, const float* a, const float b, float* r) {
-  vsPowx(n, a, b, r);
-}
-
-template <>
-void vPow<double>(const int n, const double* a, const double b, double* r) {
-  vdPowx(n, a, b, r);
-}
-
-template <>
-void vLog<float>(const int n, const float* a, float* r) {
-  vsLn(n, a, r);
-}
-
-template <>
-void vLog<double>(const int n, const double* a, double* r) {
-  vdLn(n, a, r);
-}
-
-template <>
-void vAdd<float>(const int n, const float* a, const float* b, float* r) {
-  vsAdd(n, a, b, r);
-}
-
-template <>
-void vAdd<double>(const int n, const double* a, const double* b, double* r) {
-  vdAdd(n, a, b, r);
-}
-
-template <>
-void vTanh<float>(const int n, const float* a, float* r) {
-  vsTanh(n, a, r);
-}
-
-template <>
-void vTanh<double>(const int n, const double* a, double* r) {
-  vdTanh(n, a, r);
-}
-
-template <>
-void vInvSqrt<float>(const int n, const float* a, float* r) {
-  vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt<double>(const int n, const double* a, double* r) {
-  vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p<float>(const int n, const float* a, float* r) {
-  vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p<double>(const int n, const double* a, double* r) {
-  vdLog1p(n, a, r);
-}
-#else
-
-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
-      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
-template <class T>
-void vLog1p(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
-      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <class T>
-void vTanh(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
-      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
-template void vInvSqrt(const int n, const double* a, double* r);
-template void vInvSqrt(const int n, const float* a, float* r);
-template void vLog1p(const int n, const float* a, float* r);
-template void vLog1p(const int n, const double* a, double* r);
-template void vTanh(const int n, const float* a, float* r);
-template void vTanh(const int n, const double* a, double* r);
-#endif
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.h b/paddle/legacy/math/MathFunctions.h
deleted file mode 100644
index 854e4baa3..000000000
--- a/paddle/legacy/math/MathFunctions.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_vml_functions.h>
-#endif
-
-#ifdef PADDLE_USE_VECLIB
-extern "C" {
-#include <cblas.h>
-#include <clapack.h>
-}
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#ifdef LAPACK_FOUND
-#include <lapacke.h>
-#endif
-#endif
-
-#ifndef LAPACK_FOUND
-extern "C" {
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-#include <cblas.h>
-#else
-typedef enum CBLAS_ORDER {
-  CblasRowMajor = 101,
-  CblasColMajor = 102
-} CBLAS_ORDER;
-#endif
-int LAPACKE_sgetrf(
-    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
-int LAPACKE_dgetrf(
-    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
-int LAPACKE_sgetri(
-    int matrix_layout, int n, float* a, int lda, const int* ipiv);
-int LAPACKE_dgetri(
-    int matrix_layout, int n, double* a, int lda, const int* ipiv);
-}
-#endif
-
-#include <cmath>
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <class T>
-void gemm(const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB,
-          const int M,
-          const int N,
-          const int K,
-          const T alpha,
-          const T* A,
-          const int lda,
-          const T* B,
-          const int ldb,
-          const T beta,
-          T* C,
-          const int ldc);
-#endif
-
-template <class T>
-int getrf(const CBLAS_ORDER Order,
-          const int M,
-          const int N,
-          T* A,
-          const int lda,
-          int* ipiv);
-
-template <class T>
-int getri(
-    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
-
-template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y) {
-  /// y = y + alpha * x
-  for (int i = 0; i < n; i++) {
-    y[i] = y[i] + alpha * x[i];
-  }
-}
-
-template <class T>
-T dotProduct(const int n, const T* x, const T* y) {
-  T result = static_cast<T>(0);
-  for (int i = 0; i < n; i++) {
-    result += x[i] * y[i];
-  }
-  return result;
-}
-
-template <class T>
-void vExp(const int n, const T* a, T* r);
-
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r);
-
-template <class T>
-void vLog(const int n, const T* a, T* r);
-
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r);
-
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r);
-
-template <class T>
-void vLog1p(const int n, const T* a, T* r);
-
-template <class T>
-void vTanh(const int n, const T* a, T* r);
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathUtils.cpp b/paddle/legacy/math/MathUtils.cpp
deleted file mode 100644
index 47ac9c187..000000000
--- a/paddle/legacy/math/MathUtils.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MathUtils.h"
-#include <algorithm>
-#include "Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/*if csc, major is cols and minor is rows, else
- * major is rows and minor is cols, according to
- * major value to initialize minor value"
- */
-void sparseRand(
-    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
-  CHECK(size_t(nnz) >= size_t(1));
-  int* cpuMajor;
-  int* cpuMinor;
-  CpuIVector cpuMinorVec(nnz);
-  CpuIVector cpuMajorVec(majorLen);
-  if (useGpu) {
-    cpuMajor = cpuMajorVec.getData();
-    cpuMinor = cpuMinorVec.getData();
-  } else {
-    cpuMajor = major;
-    cpuMinor = minor;
-  }
-
-  /*major value init*/
-  for (int i = 0; i < majorLen - 1; i++) {
-    cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1);
-  }
-  cpuMajor[majorLen - 1] = nnz;
-
-  /*minor value init according to major value*/
-  std::vector<char> used(minorMax, 0);
-  for (int i = 0; i < majorLen - 1; i++) {
-    CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax);
-    used.assign(minorMax, 0);
-    for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) {
-      int idx = ::rand() % minorMax;
-      while (used[idx]) {
-        idx = ::rand() % minorMax;
-      }
-      cpuMinor[j] = idx;
-      used[idx] = 1;
-    }
-    std::sort(cpuMinor + cpuMajor[i],
-              cpuMinor + cpuMajor[i + 1],
-              [](int a, int b) { return a < b; });
-  }
-  /*memcpy result to gpu*/
-  if (useGpu) {
-    hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen);
-    hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz);
-  }
-}
-
-int outputSize(
-    int imageSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int outputSize;
-  if (!caffeMode) {
-    outputSize =
-        (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
-  } else {
-    outputSize = (imageSize - filterSize + 2 * padding) / stride + 1;
-  }
-  CHECK_GE(outputSize, 1);
-  return outputSize;
-}
-
-int imageSize(
-    int outputSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int imageSize;
-  if (!caffeMode) {
-    imageSize =
-        (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1;
-  } else {
-    imageSize = (outputSize - 1) * stride + filterSize - 2 * padding;
-  }
-  CHECK_GE(imageSize, 1);
-  return imageSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MathUtils.h b/paddle/legacy/math/MathUtils.h
deleted file mode 100644
index 597485d9c..000000000
--- a/paddle/legacy/math/MathUtils.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-
-/**
- * this function is for SparseMatrix initialization except data.
- * It generates a random non-zero pattern for a sparse matrix.
- *
- * if format is SPARSE_CSC,
- *    major is column start index and minor is row index
- *    for each non zero value.
- * else
- *    major is row start index and minor is col
- *    index for each non zero value.
- *
- * Initialize minor value according to major value.
- *
- * For example, A is 5*3  CSC matrix, nnz is 10, then
- *
- * @code
- *   cols[i] = i * nnz / 3
- *   cols=[0, 3, 6, 10]
- * @endcode
- *
- * for column i, we randomly select cols[i+1] - cols[i] rows
- * as non zero number row index.
- *
- * rows is [1, 3, 4, 0, 2, 4, 1, 2, 3, 4]
- */
-void sparseRand(
-    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu);
-
-/**
- * Calculate output size based on caffeMode_.
- * - input(+padding): 0123456789
- * - imageSize(+padding) = 10;
- * - filterSize = 3;
- * - stride = 2;
- * - caffeMode is true:
-     - output: (012), (234), (456), (678)
-     - outputSize = 4;
- * - caffeMode is false:
- *   - output: (012), (234), (456), (678), (9)
- *   - outputSize = 5;
- */
-int outputSize(
-    int imageSize, int filterSize, int padding, int stride, bool caffeMode);
-
-/**
- * Calculate image size based on output size and caffeMode_.
- * It is the reverse function of outputSize()
- */
-int imageSize(
-    int outputSize, int filterSize, int padding, int stride, bool caffeMode);
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Matrix.cpp b/paddle/legacy/math/Matrix.cpp
deleted file mode 100644
index e53f95006..000000000
--- a/paddle/legacy/math/Matrix.cpp
+++ /dev/null
@@ -1,4787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "MathFunctions.h"
-#include "SparseMatrix.h"
-#include "SparseRowMatrix.h"
-
-#include <float.h>
-#include <algorithm>
-#include <cmath>
-
-#include <string.h>
-#include "hl_cnn.h"
-#include "hl_gpu.h"
-#include "hl_table_apply.h"
-#include "hl_top_k.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#include "NEONFunctions.h"
-#include "paddle/legacy/function/GemmFunctor.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include "SIMDFunctions.h"
-
-namespace paddle {
-
-inline real _pow(real a, real beta) { return std::pow(a, beta); }
-
-inline real _square(real a) { return a * a; }
-
-inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; }
-
-Matrix::Matrix(MemoryHandlePtr memHandle,
-               size_t height,
-               size_t width,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(
-          height,
-          width,
-          memHandle ? (reinterpret_cast<real*>(memHandle->getBuf())) : nullptr,
-          trans,
-          use_gpu) {
-  elementCnt_ = width * height;
-  memoryHandle_ = memHandle;
-}
-
-Matrix::Matrix(
-    real* data, size_t height, size_t width, bool trans, bool use_gpu)
-    : BaseMatrix(height, width, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-Matrix::Matrix(real* data,
-               size_t height,
-               size_t width,
-               size_t stride,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(height, width, stride, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-MatrixPtr Matrix::createSparseMatrix(real* data,
-                                     int* row,
-                                     int* col,
-                                     size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::create(MemoryHandlePtr memHandle,
-                         size_t height,
-                         size_t width,
-                         bool trans) {
-  if (auto gpuHandle = std::dynamic_pointer_cast<GpuMemoryHandle>(memHandle)) {
-    return std::make_shared<GpuMatrix>(gpuHandle, height, width, trans);
-  } else if (auto cpuHandle =
-                 std::dynamic_pointer_cast<CpuMemoryHandle>(memHandle)) {
-    return std::make_shared<CpuMatrix>(cpuHandle, height, width, trans);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return nullptr;
-  }
-}
-
-MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(
-    real* data, size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(real* data,
-                         size_t height,
-                         size_t width,
-                         size_t stride,
-                         bool trans,
-                         bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, stride, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, stride, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz,
-                                     SparseValueType valueType,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  }
-}
-
-void Matrix::resizeOrCreate(
-    MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::create(height, width, trans, useGpu);
-  } else {
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width);
-  }
-}
-
-void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix,
-                                        size_t height,
-                                        size_t width,
-                                        size_t nnz,
-                                        SparseValueType valueType,
-                                        SparseFormat format,
-                                        bool trans,
-                                        bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::createSparseMatrix(
-        height, width, nnz, valueType, format, trans, useGpu);
-  } else {
-    CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(matrix.get()));
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width, nnz, valueType, format);
-  }
-}
-
-void Matrix::reshape(size_t height, size_t width) {
-  CHECK(isContiguous());
-  CHECK(height_ * width_ == height * width);
-  height_ = height;
-  width_ = width;
-  stride_ = width_;
-}
-
-MatrixPtr Matrix::subMatrix(size_t startRow,
-                            size_t endRow,
-                            size_t startCol,
-                            size_t endCol) {
-  CHECK_LE(startRow, endRow);
-  CHECK_LE(endRow, getHeight());
-  CHECK_LE(startCol, endCol);
-  CHECK_LE(endCol, getWidth());
-
-  return Matrix::create(getData() + startRow * getStride() + startCol,
-                        endRow - startRow,
-                        endCol - startCol,
-                        getStride(),
-                        trans_,
-                        useGpu_);
-}
-
-void Matrix::setDiag(real value) {
-  CHECK(data_ != NULL);
-  CHECK_EQ(height_, width_);
-
-  zeroMem();
-  BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_);
-  diag.assign(value);
-}
-
-GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<GpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             true) {}
-
-GpuMatrix::~GpuMatrix() {}
-
-void GpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  zero();
-}
-
-void GpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  one();
-}
-
-void GpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real GpuMatrix::getElement(size_t x, size_t y) const {
-  real elem = 0;
-  hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real));
-  return elem;
-}
-
-real GpuMatrix::getSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-real GpuMatrix::getMin() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMin();
-}
-
-real GpuMatrix::getMax() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMax();
-}
-
-void GpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0, 1.0);
-}
-
-real GpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_abs_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-void GpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-
-  if (typeid(src) == typeid(CpuMatrix)) {
-    hl_memcpy_host2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_device2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  hl_memcpy_async(this->getData(),
-                  const_cast<real*>(src.getData()),
-                  sizeof(real) * elementCnt_,
-                  stream);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  hl_memcpy_host2device(data_, const_cast<real*>(hostSrc), sizeof(real) * size);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) {
-  LOG(FATAL) << "not implemented";
-}
-
-void GpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CpuMatrix matrix(src.getSize(), 1, false);
-  matrix.copyFrom(src);
-  copyFrom(matrix);
-}
-
-void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  real* dst = getData();
-  real* src = b.getData();
-  const int* index = rowIndex.getData();
-  hl_sequence2batch_copy(dst, src, index, width, height, true);
-}
-
-MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-MatrixPtr GpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    MatrixPtr copy_T(
-        new GpuMatrix(std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-                      height_,
-                      width_,
-                      true));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
-}
-
-void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-
-  real* dataRot = matRot->getData();
-  real* data = getData();
-  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
-}
-
-MatrixPtr GpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<GpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int lda = getStride();
-  int ldc = matInv->getStride();
-
-  hl_matrix_inverse(data, dataInv, height_, lda, ldc);
-}
-
-void GpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  BaseMatrix::addBias(b, scale);
-}
-
-void GpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  CHECK_LE(b.getWidth(), getWidth());
-  CHECK_EQ(getWidth() % b.getWidth(), 0UL);
-  hl_matrix_add_shared_bias(
-      getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale);
-}
-
-void GpuMatrix::collectBias(Matrix& a, real scale) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
-  if (!sMatPtr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    real* data = getData();
-    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
-    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
-  }
-#endif
-}
-
-void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(a.getWidth() % getWidth(), 0UL);
-  hl_matrix_collect_shared_bias(
-      getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale);
-}
-
-void GpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_forward(dst, src, starts, height, width, mode);
-}
-
-void GpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
-}
-
-/* this = scaleAB*(a*b) +  scaleT*this */
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  if (!a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.height_);
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.width_);
-    CHECK_EQ(a.height_, b.height_);
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    CHECK_EQ(width_, b.height_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.width_);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-
-  real* A_d = a.data_;
-  real* B_d = b.data_;
-  real* C_d = data_;
-  int dimM = getHeight();
-  int dimN = getWidth();
-  int dimK = !a.isTransposed() ? a.width_ : a.height_;
-  int lda = a.getStride();
-  int ldb = b.getStride();
-  int ldc = getStride();
-  hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-  hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                scaleAB,
-                scaleT,
-                lda,
-                ldb,
-                ldc);
-}
-
-void GpuMatrix::mul(const GpuSparseMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(!trans_ && !b.trans_) << "not supported";
-
-  if (!a.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_)
-        << "Matrix dimensions are not equal";
-  }
-  hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_sparse_matrix_s A_d = a.sMatrix_.get();
-  real* B_d = b.data_;
-  real* C_d = data_;
-  hl_matrix_csr_mul_dense(A_d,
-                          transA,
-                          B_d,
-                          HPPL_OP_N,
-                          C_d,
-                          height_,
-                          width_,
-                          b.height_,
-                          scaleAB,
-                          scaleT);
-#endif
-}
-
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuSparseMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
-
-  hl_sparse_matrix_s B_d = b.sMatrix_.get();
-  real* A_d = a.data_;
-  real* C_d = data_;
-  hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  if (!b.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_)
-        << "Matrix dimensions are not equal";
-  }
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  }
-#endif
-}
-
-/* this = a*b */
-void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); }
-
-void GpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul(*a_ptr_s, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul(*a_ptr, *b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-/* this = this* b */
-void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&b));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b.isTransposed()) << "Not supported";
-  mul(*this, *dynamic_cast<GpuMatrix*>(&b), scaleAB, scaleT);
-}
-
-/* this = a*this */
-void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!a.isTransposed()) << "Not supported";
-  mul(*dynamic_cast<GpuMatrix*>(&a), *this, scaleAB, scaleT);
-}
-
-void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_select_rows(a,
-                        stride_,
-                        table.getData(),
-                        table.stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_add_to_rows(table.getData(),
-                        table.stride_,
-                        a,
-                        stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::colMerge(Matrix& src) {
-  CHECK(src.height_ == height_);
-  if (!trans_ && !src.trans_) {
-    sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-}
-
-void GpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void GpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-
-  max.maxRows(*this);
-}
-
-void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  hl_matrix_top_k(maxVal.getData(),
-                  maxVal.getStride(),
-                  maxIds.getData(),
-                  this->getData(),
-                  this->getStride(),
-                  this->getWidth(),
-                  beam,
-                  numSamples);
-#endif
-}
-
-void GpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-
-  max.maxCols(*this);
-}
-
-void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  LOG(FATAL) << "Is not supported";
-}
-
-void GpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  const real* input = a.getData();
-  real* output = getData();
-  int* idForGpu = id.getData();
-
-  hl_maxout_forward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-void GpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  real* input = getData();
-  const real* output = a.getData();
-  const int* idForGpu = id.getData();
-
-  hl_maxout_backward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-/*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
-  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
-  size_t numSamples = this->getHeight();
-  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
-  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
-
-  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
-  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
-  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  size_t dim = gpuOutput->getWidth();
-  hl_matrix_classification_error(gpuTopVal->getData(),
-                                 gpuTopVal->getStride(),
-                                 gpuTopIds->getData(),
-                                 gpuOutput->getData(),
-                                 gpuOutput->getStride(),
-                                 dim,
-                                 topkSize,
-                                 numSamples,
-                                 gpuLabel->getData(),
-                                 this->getData());
-}
-
-/* copy -log(output[i * width + label]) to this->data[i] */
-void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&output);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_)
-      << "Matrix dimensions are not equal";
-
-  real* A_d = output_ptr->data_;
-  real* C_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_);
-}
-
-/* calculate the error of outputV according to label */
-void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&outputV);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output_ptr->data_;
-  real* grad_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_);
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                 IVector& label,
-                                                 real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::softmax(Matrix& output) {
-  CHECK(output.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == output.getHeight() && width == output.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  hl_matrix_softmax(inputData, outputData, height, width);
-}
-
-void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  auto starts = index.getData();
-  int numSequences = index.getSize() - 1;
-  hl_sequence_softmax_forward(inputData, outputData, starts, numSequences);
-}
-
-void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  CHECK(height_ == output.height_ && width_ == output.width_ &&
-        height_ == sftmaxSum.height_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output.data_;
-  real* sftmaxSum_d = sftmaxSum.data_;
-  real* grad_d = data_;
-  hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_);
-}
-
-void GpuMatrix::softmaxBackward(Matrix& outputV) {
-  CHECK(outputV.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == outputV.getHeight() && width == outputV.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* output_grad = getData();
-  real* output_value = outputV.getData();
-  hl_softmax_backward(output_value, output_grad, height, width);
-}
-
-void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK_EQ(label.getHeight(), height_);
-  CHECK_EQ(output.getHeight(), height_);
-  CHECK_EQ(label.getWidth(), output.getWidth());
-  CHECK_EQ((size_t)1, width_);
-
-  auto labelptr = dynamic_cast<GpuSparseMatrix*>(&label);
-  if (labelptr) {
-    LOG(FATAL) << "not supported: GpuSparseMatrix as label";
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-  add2(outputV, label, 1, 2, -2);
-}
-
-void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); }
-
-void GpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); }
-
-void GpuMatrix::softreluDerivative(Matrix& output) {
-  BaseMatrix::softreluDerivative(output);
-}
-
-void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  BaseMatrix::scaledTanh(output, p1, p2);
-}
-
-void GpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = data_;
-  size_t size = height_ * width_;
-
-  hl_rand(data, size);
-}
-
-void GpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os);
-}
-
-void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os, height, width);
-}
-
-void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  GpuMatrix gpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  gpuRef.copyFrom(*this);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = gpuRef.getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-void GpuMatrix::upsampleForward(Matrix& input,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-  CHECK(input.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = input.getData();
-  real* maskData = mask.getData();
-  real* outData = data_;
-
-  size_t batch = input.getHeight();
-
-  CHECK(imgSizeH * imgSizeW * channels == input.getWidth());
-  CHECK(imgSizeH * imgSizeW * channels == mask.getWidth());
-  CHECK_EQ(batch, this->getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-  hl_upsample_forward(inputData,
-                      maskData,
-                      batch,
-                      imgSizeH,
-                      imgSizeW,
-                      channels,
-                      outputH,
-                      outputW,
-                      outData);
-}
-
-void GpuMatrix::upsampleBackward(Matrix& outputGrad,
-                                 Matrix& mask,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW) {
-  CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
-
-  real* outputGradData = outputGrad.getData();
-  real* maskData = mask.getData();
-  real* inputGradData = data_;
-  size_t batch = outputGrad.getHeight();
-
-  CHECK(imgSizeH * imgSizeW == this->getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth());
-  hl_upsample_backward(outputGradData,
-                       maskData,
-                       batch,
-                       imgSizeH,
-                       imgSizeW,
-                       channels,
-                       outputH,
-                       outputW,
-                       inputGradData);
-}
-
-void GpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* maskData = NULL;
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  if (maskMatP != NULL) {
-    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
-    CHECK(outputH * outputW * channels == maskMatP->getWidth());
-    maskData = maskMatP->getData();
-  }
-
-  hl_maxpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     maskData);
-}
-
-void GpuMatrix::maxPoolBackward(Matrix& inputMat,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
-        outV.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* outData = outV.getData();
-  real* outDiff = outGrad.getData();
-  size_t frameNum = inputMat.getHeight();
-  size_t channels = outV.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(outGrad.getHeight() == outV.getHeight() &&
-        outGrad.getWidth() == outV.getWidth());
-
-  hl_maxpool_backward(frameNum,
-                      inputData,
-                      outData,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride());
-}
-
-void GpuMatrix::avgPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  hl_avgpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     excludeMode);
-}
-
-void GpuMatrix::avgPoolBackward(Matrix& outGrad,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputH * outputW * channels);
-
-  hl_avgpool_backward(frameNum,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride(),
-                      excludeMode);
-}
-
-void GpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not correct";
-
-  real* inputData = inputMat.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_maxpool3D_forward(num,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       maxPoolIdxData,
-                       getStride());
-}
-
-void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t frameNum = getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
-  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
-        outGrad.getWidth() == maxPoolIdx.getWidth());
-
-  hl_maxpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        maxPoolIdxData,
-                        outGrad.getStride());
-}
-
-void GpuMatrix::avgPool3DForward(Matrix& inputMat,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_forward(frameNum,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       getStride());
-}
-
-void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        outGrad.getStride());
-}
-
-void GpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_forward(
-      inputData, starts, outData, maxIndex, numSequences, dim);
-}
-
-void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
-}
-
-void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  CHECK(data.useGpu_ == true && W.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* input = data.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  real* output = getData();
-  hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  CHECK(oGrad.useGpu_ == true && data.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_w(
-      wgrad, ograd, input, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_diff(
-      ograd, input, w, diff, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-void GpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&in));
-
-  const size_t outputW = getWidth();
-  const size_t outputH = getHeight();
-  const size_t inputW = in.getWidth();
-  const size_t inputH = in.getHeight();
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgW && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    hl_bilinear_forward(inData,
-                        inImgH,
-                        inImgW,
-                        inputH,
-                        inputW,
-                        outData,
-                        outImgH,
-                        outImgW,
-                        outputH,
-                        outputW,
-                        numChannels,
-                        ratioH,
-                        ratioW);
-  }
-}
-
-void GpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&out));
-
-  const size_t inputW = getWidth();
-  const size_t inputH = getHeight();
-  const size_t outputW = out.getWidth();
-  const size_t outputH = out.getHeight();
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (outImgH == inImgH && outImgW == inImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    hl_bilinear_backward(inGrad,
-                         inImgH,
-                         inImgW,
-                         inputH,
-                         inputW,
-                         outGrad,
-                         outImgH,
-                         outImgW,
-                         outputH,
-                         outputW,
-                         numChannels,
-                         ratioH,
-                         ratioW);
-  }
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == 1 &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* entropy_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy(
-      output_d, entropy_d, mat_d, height_, outputPtr->width_);
-#endif
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* grad_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy_bp(
-      output_d, grad_d, mat_d, height_, width_);
-#endif
-}
-
-void GpuMatrix::vol2Col(real* dataSrc,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  hl_matrix_vol2Col(dataSrc,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData());
-}
-
-void GpuMatrix::col2Vol(real* dataDst,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  hl_matrix_col2Vol(dataDst,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData(),
-                    alpha,
-                    beta);
-}
-
-/**
- * CpuMatrix
- */
-
-CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<CpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             false) {}
-
-CpuMatrix::~CpuMatrix() {}
-
-void CpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  if (isContiguous()) {
-    memset(data_, 0, height_ * width_ * sizeof(real));
-  } else {
-    BaseMatrix::zero();
-  }
-}
-void CpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  BaseMatrix::one();
-}
-
-void CpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    hl_memcpy_device2host(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuMatrix) ||
-             typeid(src) == typeid(SharedCpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuSparseMatrix)) {
-    CHECK_GE(elementCnt_, src.getElementCnt());
-    copyFrom(dynamic_cast<CpuSparseMatrix&>(const_cast<Matrix&>(src)));
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(CpuSparseMatrix& src) {
-  CHECK(isContiguous());
-  CHECK(height_ == src.getHeight());
-  CHECK(width_ == src.getWidth());
-  memset(data_, 0, sizeof(real) * height_ * width_);
-  if (src.getValueType() == FLOAT_VALUE) {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = vals[j];
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = vals[j];
-        }
-      }
-    }
-  } else {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = 1.0;
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = 1.0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_async(this->getData(),
-                    const_cast<real*>(src.getData()),
-                    sizeof(real) * elementCnt_,
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else if (typeid(src) == typeid(CpuMatrix)) {
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  memcpy(data_, cpuSrc, sizeof(real) * size);
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; i++) {
-    memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_);
-  }
-}
-
-void CpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CHECK(elementCnt_ == src.getSize())
-      << "the src and dst should have same size.";
-  const int* cpuSrc = NULL;
-  IVectorPtr tmp;
-  if (src.useGpu()) {
-    CpuIVector tmp(src.getSize());
-    tmp.copyFrom(src);
-    cpuSrc = tmp.getData();
-  } else {
-    cpuSrc = src.getData();
-  }
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    data_[i] = cpuSrc[i];
-  }
-}
-
-void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  const int* index = rowIndex.getData();
-  for (size_t i = 0; i < height; i++) {
-    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
-    real* src = b.getData() + index[i] * width;
-    real* dst = getData() + i * width;
-    memcpy(dst, src, sizeof(real) * width);
-  }
-}
-
-MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-void CpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real CpuMatrix::getElement(size_t x, size_t y) const {
-  return data_[x * stride_ + y];
-}
-
-real CpuMatrix::getSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += data_[i * width_ + j];
-    }
-  }
-  return sum;
-}
-
-void CpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-
-  sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1);
-}
-
-real CpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += fabs(data_[i * width_ + j]);
-    }
-  }
-  return sum;
-}
-
-MatrixPtr CpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    return std::make_shared<CpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        true);
-  } else {
-    MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      dataTrans[j * ldc + i] = data[i * lda + j];
-    }
-  }
-}
-
-void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-  real* dataRot = matRot->getData();
-  real* data = getData();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      if (clockWise) {
-        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
-      } else {
-        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
-      }
-    }
-  }
-}
-
-MatrixPtr CpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<CpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  CHECK_EQ(height_, matInv->getHeight());
-  CHECK_EQ(width_, matInv->getWidth());
-  matInv->copyFrom(*this);
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int ldc = matInv->getStride();
-
-  if (height_ == 1) {
-    CHECK_NE(*data, 0);
-    *dataInv = 1.0 / (*data);
-    return;
-  }
-
-  /* Compute the LU decomposition of the matrix */
-  std::vector<int> ipiv(height_);
-  CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor);
-  int info = getrf<real>(order, height_, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-
-  /* Compute the inverse of the matrix given its LU decompsotion */
-  info = getri<real>(order, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-}
-
-void CpuMatrix::upsampleForward(Matrix& input,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-  real* inputData = input.getData();
-  real* maskData = mask.getData();
-  real* outData = data_;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t batch = input.getHeight();
-  CHECK(inLength == input.getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-
-  for (size_t k = 0; k < batch; k++) {
-    for (size_t c = 0; c < channels; c++) {
-      for (size_t i = 0; i < inLength; i++) {
-        size_t out_index = static_cast<int>(maskData[i]);
-        if (out_index >= outLength) {
-          LOG(FATAL) << "upsample index " << out_index << " out of range.";
-        }
-        outData[out_index] = inputData[i];
-      }
-      inputData += inLength;
-      maskData += inLength;
-      outData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::upsampleBackward(Matrix& outputGrad,
-                                 Matrix& mask,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t channels,
-                                 size_t outputH,
-                                 size_t outputW) {
-  real* outputGradData = outputGrad.getData();
-  real* maskData = mask.getData();
-  real* inputGradData = data_;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t batch = outputGrad.getHeight();
-  CHECK(inLength == this->getWidth() / channels);
-  CHECK_EQ(batch, this->getHeight());
-  CHECK_EQ(channels * outLength, outputGrad.getWidth());
-
-  for (size_t k = 0; k < batch; k++) {
-    for (size_t c = 0; c < channels; c++) {
-      for (size_t i = 0; i < inLength; i++) {
-        size_t out_index = static_cast<int>(maskData[i]);
-        if (out_index >= outLength) {
-          LOG(FATAL) << "upsample index " << out_index << " out of range.";
-        }
-        inputGradData[i] = outputGradData[out_index];
-      }
-      inputGradData += inLength;
-      maskData += inLength;
-      outputGradData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  real* inputData = inputMat.getData();
-  real* outData = data_;
-  real* maskData = NULL;
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  if (maskMatP != NULL) {
-    maskData = maskMatP->getData();
-    CHECK_EQ(channels * outLength, maskMatP->getWidth());
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = data_ + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = hstart + sizeY;
-        hstart = hstart < 0 ? 0 : hstart;
-        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = wstart + sizeX;
-          wstart = wstart < 0 ? 0 : wstart;
-          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
-
-          real maxval = -(real)FLT_MAX;
-          int max_index = -1;
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              if (maxval < inputData[h * imgSizeW + w]) {
-                maxval = inputData[h * imgSizeW + w];
-                max_index = h * imgSizeW + w;
-              }
-            }
-          }
-
-          outData[ph * outputW + pw] = maxval;
-          if (maskData != NULL) maskData[ph * outputW + pw] = max_index;
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-
-      if (maskData != NULL) maskData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPoolBackward(Matrix& image,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  size_t num = image.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(image.getWidth() == inLength * channels);
-  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
-  CHECK(outV.getHeight() == outGrad.getHeight() &&
-        outV.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = data_;
-  real* inData = image.getData();
-  real* otData = outV.getData();
-  real* otGrad = outGrad.getData();
-
-  size_t outStride = outV.getStride();
-  real* origOutData = otData;
-  real* origOutGrad = otGrad;
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outV.isContiguous()) {
-      otData = origOutData + n * outStride;
-      otGrad = origOutGrad + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtGrad[h * imgSizeW + w] =
-                  scaleTargets * tgtGrad[h * imgSizeW + w] +
-                  scaleOutput * otGrad[ph * outputW + pw] *
-                      (inData[h * imgSizeW + w] == otData[ph * outputW + pw]);
-            }
-          }
-        }
-      }
-      // offset
-      inData += inLength;
-      tgtGrad += inLength;
-      otData += outLength;
-      otGrad += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolForward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = data_;
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          tgtData[ph * outputW + pw] = 0;  // clear
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
-            }
-          }
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-          tgtData[ph * outputW + pw] /= poolSize;
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolBackward(Matrix& input,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  size_t num = input.getHeight();
-  size_t channels = input.getWidth() / outputH / outputW;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize;
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  real* inputData = inputMat.getData();
-  real* outData = getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[(i)*outStride + j] = -(real)FLT_MAX;
-      maxPoolIdxData[(i)*outStride + j] = -1;
-    }
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int maxIdx = -1;
-            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  if (maxOutData <
-                      inputData[(d * imgSizeH + h) * imgSizeW + w]) {
-                    maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w];
-                    maxIdx = (d * imgSizeH + h) * imgSizeW + w;
-                  }
-                }
-              }
-            }
-            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
-            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
-          }
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
-        maxPoolIdx.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = getData();
-  real* otGrad = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t outStride = outGrad.getStride();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outGrad.isContiguous()) {
-      otGrad = outGrad.getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            const size_t index = (pd * outputH + ph) * outputW + pw;
-            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
-            tgtGrad[tgtIdx] =
-                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
-          }
-        }
-      }
-      // offset
-      tgtGrad += inLength;
-      otGrad += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DForward(Matrix& input,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = getData();
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-
-            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  tgtData[(pd * outputH + ph) * outputW + pw] +=
-                      inData[(d * imgSizeH + h) * imgSizeW + w];
-                }
-              }
-            }
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
-          }
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DBackward(Matrix& input,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = input.getWidth() / outLength;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
-                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
-                }
-              }
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-/**
- * Input: one or more sequences. Each sequence contains some instances.
- * Output: output size is the number of input sequences (NOT input instances).
- * output[i] is set to max_{for each instance in this sequence}{input[i]}
- */
-void CpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&input));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(starts[numSequences], (int)input.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence, loop for each input instance
-    // (1) first instance: do not need compare, copy value to outV directly
-    for (size_t k = 0; k < dim; ++k) {
-      outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k];
-      maxIndex[sequenceId * dim + k] = starts[sequenceId];
-    }
-    // (2) other instance in same sequence
-    for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1];
-         ++insId) {
-      // insId is the index on all instances
-      for (size_t k = 0; k < dim; ++k) {
-        // for each dim
-        if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) {
-          // update max value and record index
-          outData[sequenceId * dim + k] = inputData[insId * dim + k];
-          maxIndex[sequenceId * dim + k] = insId;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence
-    for (size_t j = 0; j < dim; ++j) {
-      // each dim
-      int insId = maxIndex[sequenceId * dim + j];
-      inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j];
-    }
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += b[i];
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth] * c;
-  }
-}
-
-void CpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-
-  if (scale == 1 && getStride() % 32 == 0) {  // use libaddto
-    // @TODO(yuyang18) Make input addr can be unaligned.
-    // So merge this if and else
-    CHECK_EQ((size_t)aData % 32, 0UL);
-    CHECK_EQ((size_t)bData % 32, 0UL);
-    for (size_t i = 0; i < numSamples; i++) {
-      simd::addTo(aData + i * getStride(), bData, dim);
-    }
-  } else {
-    for (size_t i = 0; i < numSamples; i++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + j] += scale * bData[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t channel = b.getWidth();
-  CHECK_EQ(getWidth() % channel, 0UL);
-  size_t dim = getWidth() / channel;
-
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + c * dim + j] += scale * bData[c];
-      }
-    }
-  }
-}
-
-void CpuMatrix::collectBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
-  if (!aptr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    size_t nnz = aptr->getElementCnt();
-    int* cols = aptr->getCols();
-    real* A = aptr->getValue();
-    real* B = getData();
-    for (size_t i = 0; i < nnz; i++) {
-      B[cols[i]] += scale * A[i];
-    }
-  }
-}
-
-void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  real* B = getData();
-  real* A = a.getData();
-  size_t numSamples = a.getHeight();
-  size_t channel = getWidth();
-  CHECK_EQ(a.getWidth() % channel, 0UL);
-  size_t dim = a.getWidth() / channel;
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        B[c] += scale * A[i * channel * dim + c * dim + j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; i++) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + i * width);
-    dataMtx->setData(src + starts[i] * width, sequenceLength, width);
-    if (mode == 0) {
-      // plain average
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / (real)sequenceLength,
-                      /* scaleDest= */ 1);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / std::sqrt(sequenceLength),
-                      /* scaleDest= */ 1);
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; ++i) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
-    dataMtx->setData(src + i * width);
-    if (mode == 0) {
-      // plain average
-      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->addBias(*dataMtx, 1.0f);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-/* this = scaleAB*(a*b) + scaleT*this*/
-void CpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuSparseMatrix* a,
-                    CpuMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<CacheRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<SparseRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else {
-    return mul(a, b, this, scaleAB, scaleT);
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  size_t a_col, b_col, a_row, b_row;
-  bool a_trans, b_trans;
-  if (!a->isTransposed()) {
-    a_col = a->getWidth();
-    a_row = a->getHeight();
-    a_trans = false;
-  } else {
-    a_col = a->getHeight();
-    a_row = a->getWidth();
-    a_trans = true;
-  }
-  if (!b->isTransposed()) {
-    b_col = b->getWidth();
-    b_row = b->getHeight();
-    b_trans = false;
-  } else {
-    b_col = b->getHeight();
-    b_row = b->getWidth();
-    b_trans = true;
-  }
-
-  CHECK_EQ(a_col, b_row);
-  CHECK_EQ(a_row, getHeight());
-  CHECK_EQ(b_col, getWidth());
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = getData();
-
-  int M = getHeight();
-  int N = getWidth();
-  int K = a_col;
-  int lda = a->getStride();
-  int ldb = b->getStride();
-  int ldc = getStride();
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
-}
-
-void CpuMatrix::mul(
-    CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK_EQ(c->getValueType(), FLOAT_VALUE);
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = c->getValue();
-  int* rows = c->getRows();
-  int* cols = c->getCols();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[rowIdx * m + k] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getHeight();
-    CHECK_EQ(m, b->getHeight());
-    CHECK_EQ(b->getWidth(), width);
-    CHECK_EQ(a->getWidth(), height);
-
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + rowIdx] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        int start = c->getRowStartIdx(i);
-        int end = c->getRowStartIdx(i + 1);
-        for (int j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + i] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (!a->isTransposed() && b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getWidth(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getHeight(), width);
-    if (c->getFormat() == SPARSE_CSR) {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[colIdx * m + k];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      LOG(FATAL) << "Not supported csc format "
-                    "when a is not trans and b is trans";
-    }
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a,
-                    CpuSparseMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!trans_) << "Not supported";
-  CHECK(!a->isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1);
-
-  // TODO(yuyang18): Maybe bug implementation here
-  CHECK_EQ(scaleAB, static_cast<real>(1.0));
-
-  real* A = a->getData();
-  real* B = b->getValue();
-  real* C = getData();
-  int* rows = b->getRows();
-  int* cols = b->getCols();
-
-  if (scaleT == 0) {
-    zeroMem();
-  }
-  if (b->getFormat() == SPARSE_CSC) {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + j, A + rows[i], B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + rows[j], A + i, B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  } else {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + cols[i], A + j, B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + i, A + cols[j], B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::selectRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    selectRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-void CpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    data_[i] += tableData[i * table.getWidth() + idsData[i]];
-  }
-}
-
-void CpuMatrix::addElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    tableData[i * table.getWidth() + idsData[i]] += data_[i];
-  }
-}
-
-// this.row[i] += table.row[ids[i]]
-template <typename TableMatType>
-void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(a + i * stride_, table.getRow(index[i]), dim);
-  }
-}
-
-void CpuMatrix::addToRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    addToRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-// table.row[ids[i]] += this.row[i]
-template <typename TableMatType>
-void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(table.getRow(index[i]), a + i * stride_, dim);
-  }
-}
-
-static ThreadLocal<std::vector<const real*>> threadLocalColArray;
-
-template <typename MatBType, typename MatCType>
-void CpuMatrix::mul(
-    CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  // TODO(yuyang18): Maybe bug implementation here.
-  CHECK(scaleAB == 1) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported";
-
-  real* B = b->getData();
-  real* C = c->getData();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  int* cols = a->getCols();
-  real* values = a->getValue();
-
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        auto& colArray = *threadLocalColArray;
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          size_t colNum = end - start;
-          colArray.resize(colNum);
-          for (int j = 0; j < end - start; ++j) {
-            colArray[j] = b->getRow(cols[j + start]);
-          }
-          simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width);
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(i), b->getRow(cols[j]), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width);
-        }
-      }
-    }
-  } else /*if (a->isTransposed())*/ {
-    size_t m = a->getHeight();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getWidth(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            simd::addTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width);
-        }
-      }
-    }
-  }
-}
-
-// instantiation mul() called in SparseRowMatrix.cpp
-template void CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseAutoGrowRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
-                                                           CpuMatrix* b,
-                                                           CacheRowCpuMatrix* c,
-                                                           real scaleAB,
-                                                           real scaleT);
-
-#ifndef PADDLE_MOBILE_INFERENCE
-void SharedCpuMatrix::mul(CpuSparseMatrix* a,
-                          CpuMatrix* b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  CHECK_EQ(scaleAB, 1) << "Not supported";
-  CHECK_EQ(scaleT, 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported";
-
-  real* B = b->getData();
-  real* C = getData();
-  size_t height = getHeight();
-  size_t width = getWidth();
-
-  // get real trans
-  MatrixPtr aTrans;
-  if (a->isTransposed()) {
-    aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight());
-    a->transpose(aTrans, false);
-  }
-  a = dynamic_cast<CpuSparseMatrix*>(aTrans.get());
-
-  size_t m = a->getWidth();
-  CHECK_EQ(b->getHeight(), m);
-  CHECK_EQ(a->getHeight(), height);
-  CHECK_EQ(b->getWidth(), width);
-
-  size_t blockSize = (height / blockNum_) + 1;
-  CpuMatrixPtr localBuf = *localBuf_;
-  if (!localBuf) {
-    localBuf = std::make_shared<CpuMatrix>(blockSize, width);
-  } else {
-    localBuf->resize(blockSize, width);
-  }
-  localBuf->zeroMem();
-  real* localC = localBuf->getData();
-  std::vector<int>& blockSeq = *blockSeq_;
-  if (blockSeq.size() == 0) {
-    for (int k = 0; k < blockNum_; ++k) {
-      blockSeq.push_back(k);
-    }
-    std::shuffle(
-        blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get());
-  }
-  std::vector<int>& localBufRows = *localBufRows_;
-  int* cols = a->getCols();
-  real* value = a->getValue();
-
-  for (int k = 0; k < blockNum_; ++k) {
-    int blockId = blockSeq[k];
-    size_t blockBegin = blockId * blockSize;
-    size_t blockEnd = (blockId + 1) * blockSize;
-    if (blockId == blockNum_ - 1) {
-      blockEnd = height;
-    }
-    if (a->getValueType() == NO_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(localC + bufPos * width, B + cols[j] * width, width);
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(
-              localC + bufPos * width, B + cols[j] * width, value[j], width);
-        }
-      }
-    }
-
-    {
-      std::lock_guard<std::mutex> guard(*blockLocks_[blockId]);
-      for (size_t i = 0; i < localBufRows.size(); ++i) {
-        vecAddTo(C + localBufRows[i] * width, localC + i * width, width);
-      }
-    }
-    memset(localC, 0, localBufRows.size() * width * sizeof(real));
-    localBufRows.clear();
-  }
-
-  VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0]
-          << " C[1]=" << C[1];
-}
-
-void SharedCpuMatrix::add(Matrix& b, real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(b, p1, p2);
-}
-
-void SharedCpuMatrix::add(real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(p1, p2);
-}
-
-void SharedCpuMatrix::initShared(int blockNum) {
-  CHECK_GT(height_ * width_, 1UL * 1024 * 1024)
-      << "should not share small matrix";
-  initBlock(blockNum);
-}
-
-void SharedCpuMatrix::initBlock(int blockNum) {
-  CHECK_LE(blockNum, 200) << "should not use large block number";
-  blockNum_ = blockNum;
-  blockLocks_.resize(blockNum);
-  for (auto& locker : blockLocks_) {
-    locker.reset(new std::mutex);
-  }
-}
-
-#endif
-/* Add a (column) vector b to matrix a, column by column */
-void CpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-/* this = a*b */
-void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
-  return mul(a, b, 1.0, 0.0);
-}
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  (void)b;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = this* b */
-void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  (void)a;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = a*this) */
-void CpuMatrix::leftMul(Matrix& a) { return leftMul(a, 1.0, 0.0); }
-
-void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); }
-
-void CpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void CpuMatrix::rowMaxId(IVector& maxIds) {
-  CHECK(!maxIds.useGpu()) << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  size_t dim = getWidth();
-
-  for (size_t i = 0; i < numSamples; i++) {
-    real sm = a[i * dim];
-    int maxId = 0;
-    for (size_t j = 1; j < dim; j++) {
-      if (a[i * dim + j] > sm) {
-        maxId = j;
-        sm = a[i * dim + j];
-      }
-    }
-    s[i] = maxId;
-  }
-}
-
-void CpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-  max.maxRows(*this);
-}
-
-/* Get the top k elements of each row of this matrix */
-void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getWidth();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i * dim + j], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i * beam + j] = vec[j].first;
-      s[i * beam + j] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-  max.maxCols(*this);
-}
-
-void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getWidth();
-  size_t beam = maxVal.getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getWidth(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getHeight();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i + j * numSamples] = vec[j].first;
-      s[i + j * numSamples] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  const real* input = a.getData();
-  int* idForCpu = id.getData();
-
-  MatrixPtr maxInMat, maxOutMat;
-  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
-  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
-
-    for (size_t i = 0; i < channels; ++i) {
-      size_t newFeatLen = i * featLen;
-      for (size_t j = 0; j < groups; ++j) {
-        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
-            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
-                       featLen);
-      }
-    }
-    maxInMat->colMax(*tmpId, *maxOutMat);
-    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
-  }
-}
-
-void CpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  size_t newFeatLen = groups * featLen;
-  real* inputG = getData();
-  const real* outG = a.getData();
-  int* idForCpu = id.getData();
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    int* idData = idForCpu + newIndex;
-
-    for (size_t i = 0; i < size; ++i) {
-      int gradIdx =
-          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
-      (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
-    }
-  }
-}
-
-void CpuMatrix::rowNormalizeL1(Matrix& out) {
-  CHECK(!out.useGpu());
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(out.getHeight(), numSamples);
-  CHECK_EQ(out.getWidth(), dim);
-  real* a = getData();
-  real* b = out.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real s = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      s += a[i * dim + j];
-    }
-    // Right now, we just bet that sum won't be zero. If this really happens,
-    // we will figure out what should be done then.
-    CHECK_GT(s, 0);
-    s = 1 / s;
-    for (size_t j = 0; j < dim; ++j) {
-      b[i * dim + j] = s * a[i * dim + j];
-    }
-  }
-}
-
-/* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  size_t numSamples = this->getHeight();
-  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
-  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
-  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
-  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
-
-  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
-  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
-  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  // top k matrix classification
-  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
-
-  size_t dim = cpuOutput->getWidth();
-  real* result = this->getData();
-  int* ids = cpuTopIds->getData();
-  int* lbl = cpuLabel->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-
-    for (size_t j = 0; j < topkSize; ++j) {
-      if (ids[j + i * topkSize] == lbl[i]) {
-        result[i] = 0;
-        break;
-      }
-      result[i] = 1.0f;
-    }
-  }
-}
-
-/* copy -log(output[label]) to this->data[i] */
-void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    cost[i] = -std::log(out[lbl[i]]);
-  }
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = _safelog(sum);
-    cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum);
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output,
-                                                 IVector& label,
-                                                 real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      if (j == (size_t)lbl[i]) {
-        grad[j] += -1 / out[j];
-      }
-      grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum;
-    }
-  }
-}
-
-#define FORWARD_LOOP()                      \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  const real* in = getData();               \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim)
-
-#define BACKWARD_LOOP()                     \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  real* grad = getData();                   \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim)
-
-void CpuMatrix::softmax(Matrix& output) {
-  CHECK(!output.useGpu());
-
-  const float THRESHOLD = -64.0;
-
-  FORWARD_LOOP() {
-    real max = -1.0e20;
-    for (size_t j = 0; j < dim; ++j) {
-      if (in[j] > max) {
-        max = in[j];
-      }
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      real a = in[j] - max;
-      if (a < THRESHOLD) {
-        a = THRESHOLD;
-      }
-      out[j] = a;
-    }
-    vExp(dim, out, out);
-
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = 1 / sum;
-    for (size_t j = 0; j < dim; ++j) {
-      out[j] *= sum;
-    }
-  }
-}
-
-void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  MatrixPtr inTmp = Matrix::create(nullptr,
-                                   /* height= */ 1,
-                                   1,
-                                   /* trans= */ false,
-                                   false);
-  MatrixPtr outTmp = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    false);
-  size_t numSequences = index.getSize() - 1;
-  auto starts = index.getData();
-  for (size_t i = 0; i < numSequences; ++i) {
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    inTmp->setData(getData() + offset, 1UL, size);
-    outTmp->setData(output.getData() + offset, 1UL, size);
-    inTmp->softmax(*outTmp);
-  }
-}
-
-void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(getHeight(), sftmaxSum.getHeight());
-
-  real* sums = sftmaxSum.getData();
-
-  BACKWARD_LOOP() {
-    real sum = sums[i];
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] = out[j] * (grad[j] - sum);
-    }
-  }
-}
-
-void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
-  real* cost = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          cost[i] += _square(out[i * dim + j]);
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(1.0 - out[i * dim + feature.col]);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          real sum1 = 0;
-          real sum2 = 0;
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            sum1 += values[j] * values[j];
-            sum2 += values[j] * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(value.col - out[i * dim + feature.col]);
-             */
-          }
-          cost[i] += sum1 - 2.0 * sum2;
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  CHECK_EQ(label.getWidth(), dim);
-
-  real* out = output.getData();
-  real* grad = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          grad[i * dim + j] += 2.0 * out[i * dim + j];
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0;
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - 1);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0 * values[j];
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - value.col);
-             */
-          }
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  real* lbl = label.getData();
-  size_t ld = getStride();
-  size_t outLd = output.getStride();
-  size_t lblLd = label.getStride();
-  CHECK(lbl);
-  for (size_t i = 0; i < numSamples;
-       ++i, out += outLd, lbl += lblLd, grad += ld) {
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] += 2.0 * (out[j] - lbl[j]);  // positive gradient;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* cost = getData();
-  real* out = output.getData();
-  real* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real absVal = std::fabs(out[j] - lbl[j]);
-      cost[i] *= destScale;
-      if (absVal < 1.0)
-        cost[i] += 0.5 * absVal * absVal;
-      else
-        cost[i] += absVal - 0.5;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), dim);
-
-  real* out = output.getData();
-  real* lbl = label.getData();
-  real* grad = getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real val = out[j] - lbl[j];
-      grad[j] *= destScale;
-      if (std::fabs(val) < 1) {
-        grad[j] += val;
-      } else {
-        grad[j] += (real(0) < val) - (val < real(0));
-      }
-    }
-  }
-}
-
-void CpuMatrix::tanh(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  vTanh(numSamples * dim, getData(), output.getData());
-}
-
-void CpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void CpuMatrix::softrelu(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  const real THRESHOLD = 40.0;
-  FORWARD_LOOP() {  // TODO(yuyang18): SIMD it?
-    for (size_t j = 0; j < dim; ++j) {
-      real x = in[j];
-      if (x > THRESHOLD) {
-        x = THRESHOLD;
-      } else if (x < -THRESHOLD) {
-        x = -THRESHOLD;
-      }
-      out[j] = x;
-    }
-  }
-  vExp(numSamples * dim, output.getData(), output.getData());
-  vLog1p(numSamples * dim, output.getData(), output.getData());
-}
-
-void CpuMatrix::softreluDerivative(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  size_t size = numSamples * dim;
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  real* grad = getData();
-  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
-  real* tmp = tmpMat->getData();
-
-  vExp(size, output.getData(), tmpMat->getData());
-
-  for (size_t i = 0; i < size; ++i) {
-    grad[i] *= (1.0 - 1.0 / tmp[i]);
-  }
-}
-
-void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-
-  const real* in = getData();
-  real* out = output.getData();
-
-  // out = p2*in
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p2 * in[i];
-  }
-
-  vTanh(numSamples * dim, out, out);
-
-  // out = p1 * out
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p1 * out[i];
-  }
-}
-
-/* uniform randomization, minimize precision = 1e-5 */
-void CpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = getData();
-  unsigned int* randSeed = ThreadLocalRand::getSeed();
-  real recipRandMax = 1.0f / (real)RAND_MAX;
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    *data++ = rand_r(randSeed) * recipRandMax;
-  }
-}
-
-void CpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  real* input = data.getData();
-  real* w = W.getData();
-  real* output = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-
-  size_t partial_sum = numElements / paraSize;
-  if (paraSize == numElements) {
-    for (size_t n = 0; n < numSamples * numElements; ++n) {
-      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
-    }
-    return;
-  }
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-  for (size_t n = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < paraSize; i++) {
-      neon::prelu(
-          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
-    }
-    input = input + numElements;
-    output = output + numElements;
-  }
-#else
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
-    }
-  }
-#endif
-}
-
-void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
-    }
-  }
-}
-
-void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
-    }
-  }
-}
-
-void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  size_t h = height_ < height ? height_ : height;
-  size_t w = width_ < width ? width_ : width;
-  os.setf(std::ostream::scientific);
-  os << "[";
-  for (size_t i = 0; i < h; ++i) {
-    for (size_t j = 0; j < w; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    if (i == h - 1) {
-      os << "]";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  size_t offset = idx * stride_;
-  os << data_[offset];
-  for (size_t i = 1; i < width_; ++i) {
-    os << " " << data_[offset + i];
-  }
-  os << ";";
-}
-
-void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-real CpuMatrix::getMin() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res > data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-real CpuMatrix::getMax() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res < data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
-  size_t height = this->getHeight();
-  size_t width0 = this->getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in0.getHeight());
-  CHECK_EQ(width0, in0.getWidth());
-  CHECK_EQ(height, in1.getHeight());
-
-  CHECK_EQ(width1 % 2, 1U);
-
-  real* outV = this->getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height;
-       ++x, outV += width0, inV0 += width0, inV1 += width1) {
-    for (size_t i = 0; i < width0; ++i) {  // each dimension of output
-      for (size_t j = 0; j < width1; ++j) {
-        // iterate over all dimentions of inV1
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        outV[i] += inV0[index] * inV1[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::circularConvDerivative(
-    Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) {
-  size_t height = in0.getHeight();
-  size_t width0 = in0.getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in1.getHeight());
-  CHECK_EQ(height, inG0.getHeight());
-  CHECK_EQ(width0, inG0.getWidth());
-  CHECK_EQ(height, inG1.getHeight());
-  CHECK_EQ(width1, inG1.getWidth());
-  CHECK_EQ(height, outG.getHeight());
-  CHECK_EQ(width0, outG.getWidth());
-
-  real* outGV = outG.getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-  real* inGV0 = inG0.getData();
-  real* inGV1 = inG1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height; ++x,
-              outGV += width0,
-              inV0 += width0,
-              inV1 += width1,
-              inGV0 += width0,
-              inGV1 += width1) {
-    for (size_t j = 0; j < width1; ++j) {  // iterate over width1
-      for (size_t i = 0; i < width0; ++i) {
-        // such over all dimensions of outG
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        inGV0[index] += outGV[i] * inV1[j];
-        inGV1[j] += outGV[i] * inV0[index];
-      }
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* cost = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      cost[i] -= std::log(1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]]));
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, output.getWidth());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* grad = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      grad[j] += 1.0 / (1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]]));
-    }
-  }
-}
-
-/* calculate the classification error for multi binary label */
-void CpuMatrix::classificationErrorMulti(Matrix& output,
-                                         Matrix& label,
-                                         real threshold) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* result = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    real sum = 0.0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (out[j] >= threshold) {
-        sum += 1.0;
-      }
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      if (out[cols[j]] < threshold) {
-        sum += 1.0;
-      } else {
-        sum -= 1.0;
-      }
-    }
-    result[i] = sum / dim;
-  }
-}
-
-void CpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&in));
-
-  size_t outputW = getWidth();
-  size_t batchSize = getHeight();
-  size_t inputW = in.getWidth();
-  size_t inputH = in.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-          // calculate four position for bilinear interpolation
-          const real* inPos = &inData[k * inputW + h * inImgW + w];
-          real* outPos = &outData[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            // bilinear interpolation
-            outPos[0] =
-                h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) +
-                h1lambda * (w2lambda * inPos[hid * inImgW] +
-                            w1lambda * inPos[hid * inImgW + wid]);
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&out));
-
-  size_t inputW = getWidth();
-  size_t inputH = getHeight();
-  size_t outputW = out.getWidth();
-  size_t batchSize = out.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-
-          real* inPos = &inGrad[k * inputW + h * inImgW + w];
-          const real* outPos = &outGrad[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            inPos[0] += h2lambda * w2lambda * outPos[0];
-            inPos[wid] += h2lambda * w1lambda * outPos[0];
-            inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0];
-            inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0];
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::vol2Col(real* data,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  real* outData = getData();
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIn = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
-                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
-          else
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::col2Vol(real* trg,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  real* src = getData();
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIm = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
-                alpha *
-                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
-                beta *
-                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
-        }
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////
-//               functions executed via cpu                   //
-////////////////////////////////////////////////////////////////
-
-void GpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  execViaCpu2(&CpuMatrix::selectElements, *this, table, ids);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/math/Matrix.h b/paddle/legacy/math/Matrix.h
deleted file mode 100644
index ff4f4cfc2..000000000
--- a/paddle/legacy/math/Matrix.h
+++ /dev/null
@@ -1,2189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <memory>
-#include <thread>
-
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "Vector.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
-enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
-
-/**
- * @brief  matrix sparse_format .
- *
- * nnz represents nonzero number in sparse matrix.
- *
- * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
- * represents row start index in Matrix. length of col and value are nnz.
- *
- * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
- * represents col start index in Matrix. length of col and value are nnz.
- *
- * @code
- * for example: [0, 1, 0, 2, 0;
- *               1, 0, 0, 0, 0;
- *               0, 0, 0, 2, 5];
- * SPARSE_CSR row   [0, 2, 3, 5];
- *            col   [1, 3, 0, 3, 4];
- *            value [1, 2, 1, 2, 5]
- * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
- *            row   [1, 0, 0, 2, 2];
- *            value [1, 1, 2, 2, 5]
- * @endcode
- */
-/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
-enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-class Matrix;
-class GpuMatrix;
-class CpuMatrix;
-class CpuSparseMatrix;
-class GpuSparseMatrix;
-typedef std::shared_ptr<Matrix> MatrixPtr;
-typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
-typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
-typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
-typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-class Matrix : public BaseMatrix {
- protected:
-  Matrix(MemoryHandlePtr memHandle,
-         size_t height,
-         size_t width,
-         bool trans,
-         bool use_gpu);
-
-  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
-
-  Matrix(real* data,
-         size_t height,
-         size_t width,
-         size_t stride,
-         bool trans,
-         bool use_gpu);
-
-  static ThreadLocal<MatrixPtr> tmpMat_;
-
- public:
-  size_t elementCnt_;  // maximal number of elements which can be held in data_
-  MemoryHandlePtr memoryHandle_;
-
- public:
-  virtual ~Matrix() {}
-
-  static MatrixPtr create(MemoryHandlePtr memHandle,
-                          size_t height,
-                          size_t width,
-                          bool trans = false);
-  static MatrixPtr create(size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          size_t stride,
-                          bool trans = false,
-                          bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      bool trans = false,
-                                      bool useGpu = false);
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      SparseFormat foramt = SPARSE_CSR,
-                                      bool trans = false,
-                                      bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(real* data,
-                                      int* row,
-                                      int* col,
-                                      size_t height,
-                                      size_t width,
-                                      size_t nnz, /* used to allocate space */
-                                      SparseValueType valueType, /*value type*/
-                                      SparseFormat format,
-                                      bool trans,
-                                      bool useGpu);
-
-  static void resizeOrCreateSparseMatrix(
-      MatrixPtr& matrix,
-      size_t height,
-      size_t width,
-      size_t nnz,
-      SparseValueType valueType = FLOAT_VALUE,
-      SparseFormat foramt = SPARSE_CSR,
-      bool trans = false,
-      bool useGpu = false);
-
-  static void resizeOrCreate(MatrixPtr& a,
-                             size_t height,
-                             size_t width,
-                             bool trans = false,
-                             bool useGpu = false);
-
-  /**
-   * @brief  set the data buffer used to hold the matrix data.
-   *
-   * caller should make sure that the size of data is at least
-   * sizeof(real)*height*width.
-   */
-  void setData(real* data) {
-    BaseMatrix::setData(data);
-    memoryHandle_.reset();
-  }
-
-  /// the data should be contiguous
-  void setData(real* data, size_t newHeight, size_t newWidth) {
-    setData(data);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-  }
-
-  size_t getWidth() const { return width_; }
-  size_t getHeight() const { return height_; }
-  size_t getStride() const { return stride_; }
-  size_t getElementCnt() const { return elementCnt_; }
-  virtual real* getData() { return data_; }
-  virtual const real* getData() const { return data_; }
-  bool isTransposed() const { return trans_; }
-  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-
-  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
-  // befor call the following functions.
-  // Declare these functions in the base class just easy to call them.
-  // And these declarations should be moved to base class of sparse matrix
-  // if refactor sparse matrix
-  virtual int* getRows() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual int* getCols() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual SparseFormat getFormat() const {
-    LOG(FATAL) << "Not implemented";
-    return SPARSE_CSR;  //! suppress warning for no return value.
-  }
-
-  virtual SparseValueType getValueType() const {
-    LOG(FATAL) << "Not implemented";
-    return NO_VALUE;  //! suppress warning for no return value.
-  }
-
-  /**
-   * @brief matrix elment-wise add
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   */
-  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
-
-  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
-
-  void setDiag(real value);
-
-  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void trimFrom(const CpuSparseMatrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  // For GpuMatrix this is an asynchronous copy interface
-  // For CpuMatrix this is an synchronous copy interface
-  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  MatrixPtr subMatrix(size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol);
-
-  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
-    return subMatrix(startRow, endRow, 0, getWidth());
-  }
-
-  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
-    return subMatrix(0, getHeight(), startCol, endCol);
-  }
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
-    CHECK_LE(startRow + numRows, getHeight());
-    return Matrix::create(getData() + startRow * getWidth(),
-                          numRows,
-                          getWidth(),
-                          trans_,
-                          useGpu_);
-  }
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
-    CHECK_LE(startRow + numRows, getHeight());
-    CHECK_EQ(useGpu_, dest->useGpu_);
-    dest->setData(this->rowBuf(startRow), numRows, getWidth());
-    return dest;
-  }
-
-  /**
-   * If this is GpuMatrix, src is assumed to be CPU memory
-   *
-   * If this is CpuMatrix, src is assumed to be CPU memory
-   */
-  virtual void copyFrom(const real* src, size_t size) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void copyFrom(const real* src, const int64_t* seq) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief convert a int vector to a real matrix.
-   *
-   * (1) source and dest are both in CPU.
-   *
-   * (2) sizes are exactly match.
-   */
-  virtual void copyFrom(const IVector& src) {
-    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
-  }
-
-  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
-   *        NonValueSparseMatrix, etc.) as this.
-   *
-   * If height and width is zero, the new matrix will have the same size
-   * as this, otherwise the new matrix will have the specified size.
-   *
-   */
-  virtual MatrixPtr clone(size_t height = 0,
-                          size_t width = 0,
-                          bool useGpu = false) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real* getRowBuf(size_t row) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real getElement(size_t x, size_t y) const {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual real getSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void accumulateColSum(Matrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual real getAbsSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  /**
-   * @note Original data may not be preserved after resize().
-   */
-  virtual void resize(size_t newHeight, size_t newWidth) = 0;
-
-  /**
-   * @note This should only be used for sparse matrix.
-   */
-  virtual void resize(size_t newHeight,
-                      size_t newWidth,
-                      size_t newNnz, /* total item used to allocate space */
-                      SparseValueType valueType,
-                      SparseFormat format) = 0;
-
-  /**
-   * @brief This should only be used for sparse matrix.
-   *
-   * Currently must be called for each row in order.
-   * The matrix is not valid until setRow is called for the last row.
-   */
-  virtual void setRow(size_t row,
-                      size_t colNum,
-                      const unsigned int* cols,
-                      const real* values) = 0;
-
-  virtual MatrixPtr getTranspose() = 0;
-
-  /**
-   * @brief  hard transpose.
-   *
-   * allocate matTrans' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  rotate 90 degrees in clock-wise if clockWise=true;
-   *         otherwise rotate in anti clock-wise
-   * clock-wise:
-   * \f[
-   *   y(j,i) = x(M-i-1,j)
-   * \f]
-   * anti clock-wise:
-   * \f[
-   *   y(j,i) = x(i, N-1-j)
-   * \f]
-   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
-   *
-   * allocate matRot' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual MatrixPtr getInverse() {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  /**
-   * @brief  inverse.
-   *
-   * if allocate matInv's memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
- public:
-  /// Only set all variables to 0 or NULL but not free them.
-  virtual void clear() {
-    height_ = 0;
-    width_ = 0;
-    data_ = NULL;
-  }
-
-  void reshape(size_t height, size_t width);
-
-  /// add b to each sample of this.
-  virtual void addBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void addSharedBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void addBias(Matrix& b, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      addBias(b, scale);
-    } else {
-      addSharedBias(b, scale);
-    }
-  }
-
-  /// add each sample from a to this.
-  virtual void collectBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void collectSharedBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void collectBias(Matrix& a, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      collectBias(a, scale);
-    } else {
-      collectSharedBias(a, scale);
-    }
-  }
-
-  virtual void sequenceAvgForward(Matrix& a,
-                                  const IVector& startsPos,
-                                  int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void sequenceAvgBackward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  virtual void mul(const Matrix& a,
-                   const Matrix& b,
-                   real scaleAB,
-                   real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// Add a vector (column) b to matrix a, column by column.
-  virtual void addColumnVector(const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += vec(index(i, j), 0)
-   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
-   * @endcode
-   */
-  virtual void addByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   vec(index(i, j), 0) += this(i, j)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void addByBitCodeBackward(size_t numClasses,
-                                    const IVector& codes,
-                                    Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& mat,
-                            const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
-                                          const IVector& codes,
-                                          Matrix& mat,
-                                          const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   input.row(i) += this(i, j) * mat.row(index(i, j))
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardError(size_t numClasses,
-                                         const IVector& codes,
-                                         const Matrix& mat,
-                                         Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
-   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
-   * @endcode
-   */
-  virtual void sumByBitCode(size_t numClasses,
-                            IVector& codes,
-                            Matrix& sum,
-                            real scaleSum) {
-    (void)numClasses;
-    (void)codes;
-    (void)sum;
-    (void)scaleSum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *  this(i, j) -= bit(i, j)
-   * where bit(i, j) is same as that for sumByBitCode
-   * @endcode
-   */
-  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
-    (void)numClasses_;
-    (void)codes;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * add the sum of each row of this to mat
-   */
-  virtual void rowSum(Matrix& sum) {
-    (void)sum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each row of this to mat
-   */
-  virtual void rowMax(Matrix& max) {
-    (void)max;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each column of this to mat
-   */
-  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each column of this matrix.
-   *
-   * The row ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutForward(Matrix& a,
-                             IVector& id,
-                             size_t channels,
-                             size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutBackward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each row of this matrix.
-   *
-   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void rowMax(IVector& maxIds, Matrix& max) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// normalize each row so that the sum of each row is 1.
-  virtual void rowNormalizeL1(Matrix& out) {
-    (void)out;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   *  this = a*b
-   * @endcode
-   */
-  virtual void mul(const Matrix& a, const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = a*this)
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
-
-  /// merge the element for each col.
-  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                              IVector& label,
-                                              real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                IVector& label,
-                                                real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * \f[
-   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
-   * \f]
-   *
-   * b contains M elements,
-   * c contains N elements (N is odd),
-   * b's index arithmetic is computed modulo M,
-   * c's index arithmetic is computed modulo N.
-   */
-  virtual void circularConv(Matrix& b, Matrix& c) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void circularConvDerivative(Matrix& output,
-                                      Matrix& prevOut1,
-                                      Matrix& prevOut2,
-                                      Matrix& prevGrad1,
-                                      Matrix& prevGrad2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
-  virtual void softmax(Matrix& output) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void softmaxBackward(Matrix& outputV) {
-    (void)outputV;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /*
-    sum_i = sum_j this_ij * output_ij
-    this_ij = output_ij* (this_ij - sum_i)
-  */
-  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the sum of squares diff cost.
-  virtual void sumOfSquares(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// gradient of sumOfSquares.
-  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void tanhDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void softreluDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void scaledTanh(Matrix& output, real p1, real p2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print out the values of elements to os
-  virtual void print(std::ostream& os) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * print a part of the matrix
-   * from the (top,left) value to the (height, width) value (not included)
-   */
-  virtual void print(std::ostream& os, size_t height, size_t width) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print one row to os
-  virtual void printOneRow(std::ostream& os, size_t idx) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
-
-  virtual real getMin() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-  virtual real getMax() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief  calulate the error of classification
-   *
-   * output[i] = 1 if row i is an error.
-   *
-   * output[i] = 0 if row i is correct.
-   *
-   */
-  virtual void classificationError(Matrix& output,
-                                   IVector& label,
-                                   size_t topkSize = 1) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void upsampleForward(Matrix& input,
-                               Matrix& mask,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t outputH,
-                               size_t outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void upsampleBackward(Matrix& outputGrad,
-                                Matrix& mask,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t channels,
-                                size_t outputH,
-                                size_t outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value, if the maskMatP is not NULL, it will
-   * also caculate the location indices.
-   */
-  virtual void maxPoolForward(Matrix& inputMat,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              MatrixPtr maskMatP = NULL) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling backward operation.
-  virtual void maxPoolBackward(Matrix& image,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               Matrix& outGrad,
-                               Matrix& outV,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling forward operation, caculate the average of sizeX elements.
-  virtual void avgPoolForward(Matrix& input,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPoolBackward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling 3D forward operation, pick out the largest element
-   * in the sizeX of value
-   */
-  virtual void maxPool3DForward(Matrix& inputMat,
-                                Matrix& maxPoolIdx,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxPool3DBackward(Matrix& outGrad,
-                                 Matrix& maxPoolIdx,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DForward(Matrix& input,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DBackward(Matrix& input,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
- * Input: one or more sequences. Each sequence contains some instances.
- *
- * Output: output size is the number of input sequences (NOT input
- * instances).
- *
- * output[i] is set to max_input[i].
- */
-  virtual void maxSequenceForward(Matrix& input,
-                                  const IVector& sequence,
-                                  IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxSequenceBackward(Matrix& outputGrad,
-                                   const IVector& sequence,
-                                   IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-  /**
-   * @brief  cross entropy for multi binary labels
-   *
-   * @code
-   * this[i] = -sum(label[i][j]*log(output[i][j])
-   *           + (1-label[i][j])*log(1-output[i][j]))
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  The gradient of cross entropy for multi binary labels on output
-   *
-   * @code
-   * this[i][j] = -label[i][j]/output[i][j]
-   *              + (1-label[i][j])/(1-output[i][j])
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  Calculate the classification error for multi binary labels
-   *
-   * @code
-   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
-   *            || (output[i][j] < threshold && label[i][j] == 1))
-   *            / output->getWidth()
-   * @endcode
-   */
-  virtual void classificationErrorMulti(Matrix& output,
-                                        Matrix& label,
-                                        real threshold) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void paramReluForward(Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void vol2Col(real* data,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void col2Vol(real* trg,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real alpha,
-                       real beta) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void bilinearForward(const Matrix& in,
-                               const size_t inImgH,
-                               const size_t inImgW,
-                               const size_t outImgH,
-                               const size_t outImgW,
-                               const size_t numChannels,
-                               const real ratioH,
-                               const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void bilinearBackward(const Matrix& out,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<real>(*this, expr);
-    } else {
-      TensorCpuApply<real>(*this, expr);
-    }
-  }
-
-  bool isEmpty() const { return data_ == nullptr; }
-
-  explicit operator bool() const { return !isEmpty(); }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
-  mat.print(os);
-  return os;
-}
-
-class GpuMatrix : public Matrix {
- public:
-  GpuMatrix();
-
-  GpuMatrix(size_t height, size_t width, bool trans = false);
-  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, true) {}
-  GpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, true) {}
-  GpuMatrix(GpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, true) {}
-  ~GpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  /**
-   * Copy the data from cpu_memory buffer
-   */
-  void copyFrom(const real* hostSrc, size_t size);
-
-  void copyFrom(const real* hostSrc, const int64_t* seq);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const IVector& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  real getElement(size_t x, size_t y) const;
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  real getMin();
-  real getMax();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /**
-   * @code
-   * add each sample from a to this.
-   * @endcode
-   */
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*b
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-
-  void mul(const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  void mul(const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  void rightMul(Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*this
-   * @endcode
-   */
-  void leftMul(Matrix& a);
-
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& max);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& max);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxBackward(Matrix& outputV);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  virtual void print(std::ostream& os) const;
-  virtual void print(std::ostream& os, size_t height, size_t width) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void upsampleForward(Matrix& input,
-                       Matrix& mask,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t channels,
-                       size_t outputH,
-                       size_t outputW);
-
-  void upsampleBackward(Matrix& outputGrad,
-                        Matrix& mask,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t channels,
-                        size_t outputH,
-                        size_t outputW);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<real>(*this, expr);
-  }
-};
-
-class CpuMatrix : public Matrix {
- private:
-  MatrixPtr sftmaxSum_;
-  MatrixPtr sftmaxDot_;
-
- public:
-  CpuMatrix(size_t height, size_t width, bool trans = false);
-  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, false) {}
-  CpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, false) {}
-
-  CpuMatrix(CpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, false) {}
-
-  ~CpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  real getElement(size_t x, size_t y) const;
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const real* cpuSrc, size_t size);
-
-  void copyFrom(const real* cpuSrc, const int64_t* seq);
-
-  void copyFrom(const IVector& src);
-
-  void copyFrom(CpuSparseMatrix& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  void upsampleForward(Matrix& input,
-                       Matrix& mask,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t channels,
-                       size_t outputH,
-                       size_t outputW);
-
-  void upsampleBackward(Matrix& outputGrad,
-                        Matrix& mask,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t channels,
-                        size_t outputH,
-                        size_t outputW);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
- public:
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /// add each sample of a to this.
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids);
-
-  /**
-   * use abstract getRow() to get row from table.
-   *
-   * Define table as template instead of virtual class for performance sake.
-   * internal used by above two virtual funcs.
-   */
-  template <typename TableMatType>
-  void selectRowsImp(TableMatType& table, IVector& ids);
-  template <typename TableMatType>
-  void addToRowsImp(TableMatType& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
-
-  static void mul(CpuMatrix* a,
-                  CpuMatrix* b,
-                  CpuSparseMatrix* c,
-                  real scaleAB,
-                  real scaleT);
-
-  /**
-   * c = a * b
-   *
-   * use abstract getRow() to get row from B,C.
-   * Define B,C as template instead of virtual class for performance sake.
-   */
-  template <typename MatBType, typename MatCType>
-  static void mul(
-      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(const Matrix& a, const Matrix& b);
-
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-  void rightMul(Matrix& b);
-
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-  void leftMul(Matrix& a);
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMaxId(IVector& maxIds);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& maxVal);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void rowNormalizeL1(Matrix& out);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void circularConv(Matrix& b, Matrix& c);
-  void circularConvDerivative(Matrix& output,
-                              Matrix& prevOut1,
-                              Matrix& prevOut2,
-                              Matrix& prevGrad1,
-                              Matrix& prevGrad2);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-
-  void smoothL1(Matrix& output, Matrix& label, real destScale);
-  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);
-
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  void print(std::ostream& os) const;
-  void print(std::ostream& os, size_t height, size_t width) const;
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-
-  real getMin();
-  real getMax();
-
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
-
-  void addByBitCodeBackward(size_t numClasses,
-                            const IVector& codes,
-                            Matrix& vec);
-
-  void mulByBitCode(size_t numClasses,
-                    const IVector& codes,
-                    const Matrix& mat,
-                    const Matrix& input);
-
-  void mulByBitCodeBackwardWeight(size_t numClasses,
-                                  const IVector& codes,
-                                  Matrix& mat,
-                                  const Matrix& input);
-
-  void mulByBitCodeBackwardError(size_t numClasses,
-                                 const IVector& codes,
-                                 const Matrix& mat,
-                                 Matrix& input);
-
-  void sumByBitCode(size_t numClasses,
-                    IVector& codes,
-                    Matrix& sum,
-                    real scaleSum);
-
-  void subByBitCode(size_t numClasses_, IVector& codes);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<real>(*this, expr);
-  }
-};
-
-class SharedCpuMatrix : public CpuMatrix {
- public:
-#ifndef PADDLE_MOBILE_INFERENCE
-  /* blockNum is number of partitions of the matrix  */
-  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(height, width, trans) {
-    initShared(blockNum);
-  }
-  SharedCpuMatrix(
-      int blockNum, real* data, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(data, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(int blockNum,
-                  CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initBlock(1);
-  }
-
-  ~SharedCpuMatrix() {}
-
- public:
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  virtual void add(Matrix& b, real p1, real p2);
-  virtual void add(real p1, real p2);
-
- private:
-  using Matrix::mul;
-  void initShared(int blockNum);
-  void initBlock(int blockNum);
-
-  int blockNum_;
-  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
-  ThreadLocal<CpuMatrixPtr> localBuf_;
-  ThreadLocal<std::vector<int>> localBufRows_;
-  ThreadLocal<std::vector<int>> blockSeq_;
-#endif
-};
-
-typedef struct { unsigned int col; } sparse_non_value_t;
-
-typedef struct {
-  unsigned int col;
-  float value;
-} sparse_float_value_t;
-
-}  // namespace paddle
-#include "ExecViaCpu.h"
diff --git a/paddle/legacy/math/MatrixBitCode.cpp b/paddle/legacy/math/MatrixBitCode.cpp
deleted file mode 100644
index f35f266a3..000000000
--- a/paddle/legacy/math/MatrixBitCode.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-namespace {
-
-struct SimpleCode {
-  SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {}
-  inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; }
-  inline bool calcBit(int bit) const { return c_ & (1 << bit); }
-  inline int getLength() const { return findLastSet(c_) - 1; }
-
- private:
-  size_t c_;
-};
-
-struct SimpleCodeTable {
-  explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {}
-  SimpleCode operator()(size_t code) const {
-    return SimpleCode(code, numClasses_);
-  }
-  size_t size() const { return numClasses_; }
-  int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
-
- private:
-  size_t numClasses_;
-  int maxCodeLength_;
-};
-
-}  // namespace
-
-/**
- * CodeTable class should support 3 functions:
- *
- * size_t size()
- *   return the number of codes
- *
- * int getMaxCodeLength()
- *   return the maximal code length
- *
- * Code operator()(size_t i)
- *   return the i-th code. Code class is descriebed below.
- *
- * Code class should support 3 functions:
- *
- * int getLength()
- *   return the length of the code
- *
- * bool calcIndex(int bit)
- *   bit ranges from 0 to getLength() - 1
- *   return the index for the (1+bit) level parent
- *
- * bool calcBit(int bit)
- *   return true if the bit level parent is the right child of (1+bit) level
- *   parent
- *
- */
-
-/*
-   for i:
-     for j < codeLength:
-       op(tmat(i, j), vec(0, index(i, j)))
-*/
-template <class CodeTable, class Op, class TMat, class Mat>
-static void addByBitCodeT(
-    Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) {
-  CHECK(!vec.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(vec.getHeight(), (size_t)1);
-  CHECK_EQ(vec.getWidth(), numClasses - 1);
-
-  auto data = tmat.getData();
-  auto v = vec.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], v[index]);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += vec(0, index(i, j))
-*/
-void CpuMatrix::addByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& vec) {
-  auto op = [](real& t, real v) { t += v; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/* For j < codeLength:
-   vec(0, index(i, j)) += this(i, j)
-*/
-void CpuMatrix::addByBitCodeBackward(size_t numClasses,
-                                     const IVector& codes,
-                                     Matrix& vec) {
-  auto op = [](real t, real& v) { v += t; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/*
-  for i:
-    for j < codeLength:
-      op(tmat(i, j), mat.row(index(i, j)), input.row(i))
-*/
-template <class Op,
-          class CodeTable,
-          class IVec,
-          class TMat,
-          class WMat,
-          class InMat>
-void mulByBitCodeT(Op op,
-                   CodeTable codeTable,
-                   IVec& codes,
-                   TMat& tmat,
-                   WMat& weight,
-                   InMat& input) {
-  CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t inputDim = input.getWidth();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(input.getHeight(), numSamples);
-  CHECK_EQ(weight.getHeight(), numClasses - 1);
-  CHECK_EQ(weight.getWidth(), inputDim);
-
-  real* data = tmat.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += <weight.row(index(i, j)), input.row(i)>
-*/
-void CpuMatrix::mulByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& weight,
-                             const Matrix& input) {
-  auto op = [](
-      real& t, const real* weightRow, const real* inputRow, size_t inputDim) {
-    real sum = 0;
-    for (size_t k = 0; k < inputDim; ++k) {
-      sum += weightRow[k] * inputRow[k];
-    }
-    t += sum;
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For index(i, j) >= 0:
-   weight.row(index(i, j)) += this(i, j) * input.row(i)
-*/
-void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses,
-                                           const IVector& codes,
-                                           Matrix& weight,
-                                           const Matrix& input) {
-  auto op = [](
-      const real t, real* weightRow, const real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      weightRow[k] += t * inputRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For j < codeLength:
-   input.row(i) += this(i, j) * weight.row(index(i, j))
-*/
-void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses,
-                                          const IVector& codes,
-                                          const Matrix& weight,
-                                          Matrix& input) {
-  auto op = [](
-      const real t, const real* weightRow, real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      inputRow[k] += t * weightRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-template <class CodeTable>
-void sumByBitCodeT(CodeTable codeTable,
-                   IVector& codes,
-                   const CpuMatrix& tmat,
-                   Matrix& sum,
-                   real scaleSum) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(sum.getHeight(), numSamples);
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  const real* data = tmat.getData();
-  real* s = sum.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real sm = 0;
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        sm += data[i * oWidth + j];
-      }
-    }
-    s[i] = scaleSum * sm;
-  }
-}
-
-/* For j < codeLength:
-   sum(i, 0) = \sum_j  bit(i, j) * this(i, j)
-*/
-void CpuMatrix::sumByBitCode(size_t numClasses,
-                             IVector& codes,
-                             Matrix& sum,
-                             real scaleSum) {
-  sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum);
-}
-
-template <class CodeTable>
-void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-
-  real* data = tmat.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        data[i * oWidth + j] -= 1;
-      }
-    }
-  }
-}
-
-/* For j < codeLength
-   this(i, j) -= bit(i, j)
-*/
-void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) {
-  subByBitCodeT(SimpleCodeTable(numClasses), codes, *this);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MemoryHandle.cpp b/paddle/legacy/math/MemoryHandle.cpp
deleted file mode 100644
index 1563314e9..000000000
--- a/paddle/legacy/math/MemoryHandle.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MemoryHandle.h"
-#include <cmath>
-#include "Storage.h"
-
-namespace paddle {
-
-/**
- * Calculate the actual allocation size according to the required size.
- */
-MemoryHandle::MemoryHandle(size_t size) : size_(size), buf_(nullptr) {
-  if (size_ <= 256) {
-    // Memory allocation in cuda is always aligned to at least 256 bytes.
-    // In many cases it is 512 bytes.
-    allocSize_ = 256;
-  } else if (size_ <= 512) {
-    allocSize_ = 512;
-  } else if (size_ <= (1 << 16)) {
-    // Allocate multiple of 1024 bytes.
-    allocSize_ = (size + 1023) & ~(1023);
-  } else {
-    allocSize_ = size_;
-  }
-}
-
-GpuMemoryHandle::GpuMemoryHandle(size_t size) : MemoryHandle(size) {
-  CHECK(size != 0) << " allocate 0 bytes";
-  deviceId_ = hl_get_device();
-  allocator_ = StorageEngine::singleton()->getGpuAllocator(deviceId_);
-  buf_ = allocator_->alloc(allocSize_);
-}
-
-GpuMemoryHandle::~GpuMemoryHandle() { allocator_->free(buf_, allocSize_); }
-
-CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) {
-  CHECK(size != 0) << " allocate 0 bytes";
-  allocator_ = StorageEngine::singleton()->getCpuAllocator();
-  buf_ = allocator_->alloc(allocSize_);
-}
-
-CpuMemoryHandle::~CpuMemoryHandle() { allocator_->free(buf_, allocSize_); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/MemoryHandle.h b/paddle/legacy/math/MemoryHandle.h
deleted file mode 100644
index 516e09dbe..000000000
--- a/paddle/legacy/math/MemoryHandle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "PoolAllocator.h"
-
-namespace paddle {
-
-class MemoryHandle {
- protected:
-  explicit MemoryHandle(size_t size);
-  virtual ~MemoryHandle() {}
-
- public:
-  void* getBuf() const { return buf_; }
-  size_t getSize() const { return size_; }
-  size_t getAllocSize() const { return allocSize_; }
-
- protected:
-  PoolAllocator* allocator_;
-  size_t size_;       // the requested size
-  size_t allocSize_;  // the allocated size
-  int deviceId_;      // the device id of memory if gpu memory
-  void* buf_;
-};
-
-/**
- * Wrapper class for raw gpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class GpuMemoryHandle : public MemoryHandle {
- public:
-  explicit GpuMemoryHandle(size_t size);
-  virtual ~GpuMemoryHandle();
-};
-
-/**
- * Wrapper class for raw cpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class CpuMemoryHandle : public MemoryHandle {
- public:
-  explicit CpuMemoryHandle(size_t size);
-  virtual ~CpuMemoryHandle();
-};
-
-typedef std::shared_ptr<MemoryHandle> MemoryHandlePtr;
-typedef std::shared_ptr<CpuMemoryHandle> CpuMemHandlePtr;
-typedef std::shared_ptr<GpuMemoryHandle> GpuMemHandlePtr;
-}  // namespace paddle
diff --git a/paddle/legacy/math/NEONFunctions.cpp b/paddle/legacy/math/NEONFunctions.cpp
deleted file mode 100644
index 953d5bb8c..000000000
--- a/paddle/legacy/math/NEONFunctions.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-#include "NEONFunctions.h"
-#include <arm_neon.h>
-
-namespace paddle {
-namespace neon {
-
-// b[i] = a[i] > 0.0f ? a[i] : 0.0f
-void relu(const float* a, float* b, int len) {
-  int offset = len % 16;
-  float32x4_t ma0, ma1, ma2, ma3;
-  float32x4_t mb0, mb1, mb2, mb3;
-
-  float32x4_t zero = vdupq_n_f32(0.f);
-  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = vld1q_f32(a);
-    ma1 = vld1q_f32(a + 4);
-    ma2 = vld1q_f32(a + 8);
-    ma3 = vld1q_f32(a + 12);
-
-    mb0 = vmaxq_f32(ma0, zero);
-    mb1 = vmaxq_f32(ma1, zero);
-    mb2 = vmaxq_f32(ma2, zero);
-    mb3 = vmaxq_f32(ma3, zero);
-
-    vst1q_f32(b, mb0);
-    vst1q_f32(b + 4, mb1);
-    vst1q_f32(b + 8, mb2);
-    vst1q_f32(b + 12, mb3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    b[i] = a[i] > 0.0f ? a[i] : 0.0f;
-  }
-}
-
-// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
-void prelu(const float* a, float w, float* b, int len) {
-  int offset = len % 16;
-  float32x4_t ma0, ma1, ma2, ma3;
-
-  float32x4_t zero = vdupq_n_f32(0.f);
-  float32x4_t vw = vdupq_n_f32(w);
-
-  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = vld1q_f32(a);
-    ma1 = vld1q_f32(a + 4);
-    ma2 = vld1q_f32(a + 8);
-    ma3 = vld1q_f32(a + 12);
-
-    uint32x4_t flag0 = vcgtq_f32(ma0, zero);
-    uint32x4_t flag1 = vcgtq_f32(ma1, zero);
-    uint32x4_t flag2 = vcgtq_f32(ma2, zero);
-    uint32x4_t flag3 = vcgtq_f32(ma3, zero);
-
-    float32x4_t mul0 = vmulq_f32(ma0, vw);
-    float32x4_t mul1 = vmulq_f32(ma1, vw);
-    float32x4_t mul2 = vmulq_f32(ma2, vw);
-    float32x4_t mul3 = vmulq_f32(ma3, vw);
-
-    ma0 = vbslq_f32(flag0, ma0, mul0);
-    ma1 = vbslq_f32(flag1, ma1, mul1);
-    ma2 = vbslq_f32(flag2, ma2, mul2);
-    ma3 = vbslq_f32(flag3, ma3, mul3);
-
-    vst1q_f32(b, ma0);
-    vst1q_f32(b + 4, ma1);
-    vst1q_f32(b + 8, ma2);
-    vst1q_f32(b + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
-  }
-}
-
-}  // namespace neon
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/NEONFunctions.h b/paddle/legacy/math/NEONFunctions.h
deleted file mode 100644
index 33edd9d51..000000000
--- a/paddle/legacy/math/NEONFunctions.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-namespace neon {
-
-void relu(const float* a, float* b, int len);
-void prelu(const float* a, float w, float* b, int len);
-
-}  // namespace neon
-}  // namespace paddle
diff --git a/paddle/legacy/math/PoolAllocator.cpp b/paddle/legacy/math/PoolAllocator.cpp
deleted file mode 100644
index b6ad16885..000000000
--- a/paddle/legacy/math/PoolAllocator.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolAllocator.h"
-
-namespace paddle {
-
-PoolAllocator::PoolAllocator(Allocator* allocator,
-                             size_t sizeLimit,
-                             const std::string& name)
-    : allocator_(allocator),
-      sizeLimit_(sizeLimit),
-      poolMemorySize_(0),
-      name_(name) {}
-
-PoolAllocator::~PoolAllocator() { freeAll(); }
-
-void* PoolAllocator::alloc(size_t size) {
-  if (sizeLimit_ > 0) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto it = pool_.find(size);
-    if (it == pool_.end() || it->second.size() == 0) {
-      if (poolMemorySize_ >= sizeLimit_) {
-        freeAll();
-      }
-      return allocator_->alloc(size);
-    } else {
-      auto buf = it->second.back();
-      it->second.pop_back();
-      poolMemorySize_ -= size;
-      return buf;
-    }
-  } else {
-    return allocator_->alloc(size);
-  }
-}
-
-void PoolAllocator::free(void* ptr, size_t size) {
-  if (sizeLimit_ > 0) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto& it = pool_[size];
-    it.push_back(ptr);
-    poolMemorySize_ += size;
-  } else {
-    allocator_->free(ptr);
-  }
-}
-
-void PoolAllocator::freeAll() {
-  for (auto it : pool_) {
-    for (auto ptr : it.second) {
-      allocator_->free(ptr);
-    }
-  }
-  poolMemorySize_ = 0;
-  pool_.clear();
-}
-
-void PoolAllocator::printAll() {
-  size_t memory = 0;
-  LOG(INFO) << name_ << ":";
-  for (auto it : pool_) {
-    LOG(INFO) << "  size:" << it.first;
-    for (auto ptr : it.second) {
-      LOG(INFO) << "    ptr:" << ptr;
-      memory += it.first;
-    }
-  }
-  LOG(INFO) << "memory size: " << memory;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/PoolAllocator.h b/paddle/legacy/math/PoolAllocator.h
deleted file mode 100644
index 7239cf1c4..000000000
--- a/paddle/legacy/math/PoolAllocator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-#include "Allocator.h"
-
-namespace paddle {
-
-/**
- * @brief Memory pool allocator implementation.
- */
-class PoolAllocator {
- public:
-  /**
-   * @brief constructor.
-   * @param allocator a Allocator object.
-   * @param sizeLimit The maximum size memory can be managed,
-   * if sizeLimit == 0, the pool allocator is a simple wrapper of allocator.
-   */
-  PoolAllocator(Allocator* allocator,
-                size_t sizeLimit = 0,
-                const std::string& name = "pool");
-
-  /**
-   * @brief destructor.
-   */
-  ~PoolAllocator();
-
-  void* alloc(size_t size);
-  void free(void* ptr, size_t size);
-  std::string getName() { return name_; }
-
- private:
-  void freeAll();
-  void printAll();
-  std::unique_ptr<Allocator> allocator_;
-  std::mutex mutex_;
-  std::unordered_map<size_t, std::vector<void*>> pool_;
-  size_t sizeLimit_;
-  size_t poolMemorySize_;
-  std::string name_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/RowBuffer.h b/paddle/legacy/math/RowBuffer.h
deleted file mode 100644
index 9dfd5eff0..000000000
--- a/paddle/legacy/math/RowBuffer.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "MemoryHandle.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * @brief The RowBuffer class
- * Represent the SparseRow Matrix Data.
- *
- * If not set memory handler, then the data could be auto growth.
- */
-class RowBuffer {
- public:
-  /**
-   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  explicit RowBuffer(size_t width) : width_(width) {}
-
-  /**
-   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
-   * @param mem the pre-allocated memory.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
-      : preallocatedBuf_(mem), width_(width) {}
-
-  /**
-   * @brief resize resize the buffer with rowCount
-   * @param rowCnt number of row. matrix height.
-   */
-  inline void resize(int rowCnt) {
-    if (preallocatedBuf_) {
-      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
-    } else {
-      rowStore_.resize(rowCnt * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* get(int row) const {
-    if (preallocatedBuf_) {
-      CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize());
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
-    } else {
-      CHECK_LE((row + 1) * width_, rowStore_.size());
-      return const_cast<real*>(rowStore_.data() + row * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index. If row index is larger than local
-   *        buffer, the size of local buffer will grow.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* getWithAutoGrowth(int row) {
-    if (preallocatedBuf_) {
-      return get(row);
-    } else {
-      if ((rowStore_.size() <= row * width_)) {
-        rowStore_.resize((row + 1) * width_);
-      }
-      return rowStore_.data() + row * width_;
-    }
-  }
-
-  /**
-   * @return raw data buffer.
-   */
-  inline real* data() {
-    if (preallocatedBuf_) {
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
-    } else {
-      return rowStore_.data();
-    }
-  }
-
-  /**
-   * @brief clear local buffer. It only affect auto-growth buffer.
-   */
-  inline void clear() {
-    // swap an empty vector to it to free the memory.
-    std::vector<real, AlignedAllocator<real, 32>> empty;
-    rowStore_.swap(empty);
-  }
-
-  /**
-   * @brief get current number of rows.
-   * @return number of rows.
-   */
-  inline size_t getRowCount() const {
-    if (preallocatedBuf_) {
-      return preallocatedBuf_->getSize() / sizeof(real) / width_;
-    } else {
-      return rowStore_.size() / width_;
-    }
-  }
-
-  /**
-   * @brief get is this buffer can automatically grow or not.
-   * @return ture if can automacitally grow.
-   */
-  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
-
-  /**
-   * @brief return the width of matrix. a.k.a length of row.
-   * @return width of matrix
-   */
-  inline size_t getWidth() const { return width_; }
-
- private:
-  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
-  //! of std::vector here.
-  CpuMemHandlePtr preallocatedBuf_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
-  size_t width_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/math/SIMDFunctions.cpp b/paddle/legacy/math/SIMDFunctions.cpp
deleted file mode 100644
index 3cfc5d6f1..000000000
--- a/paddle/legacy/math/SIMDFunctions.cpp
+++ /dev/null
@@ -1,397 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SIMDFunctions.h"
-#ifdef __SSE3__
-#include <immintrin.h>
-#endif
-#include <algorithm>
-
-#ifdef __AVX__
-static void addto_avx(float* a, const float* b, size_t len) {
-  int offset = len % 32;
-
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 32; k++, a += 32, b += 32) {
-    ma0 = _mm256_load_ps(a);
-    ma1 = _mm256_load_ps(a + 8);
-    ma2 = _mm256_load_ps(a + 16);
-    ma3 = _mm256_load_ps(a + 24);
-
-    mb0 = _mm256_load_ps(b);
-    mb1 = _mm256_load_ps(b + 8);
-    mb2 = _mm256_load_ps(b + 16);
-    mb3 = _mm256_load_ps(b + 24);
-
-    ma0 = _mm256_add_ps(ma0, mb0);
-    ma1 = _mm256_add_ps(ma1, mb1);
-    ma2 = _mm256_add_ps(ma2, mb2);
-    ma3 = _mm256_add_ps(ma3, mb3);
-
-    _mm256_store_ps(a, ma0);
-    _mm256_store_ps(a + 8, ma1);
-    _mm256_store_ps(a + 16, ma2);
-    _mm256_store_ps(a + 24, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-
-  return;
-}
-
-static void batch_addto_avx(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 32;
-
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 32; k++, a += 32) {
-    ma0 = _mm256_load_ps(a);
-    ma1 = _mm256_load_ps(a + 8);
-    ma2 = _mm256_load_ps(a + 16);
-    ma3 = _mm256_load_ps(a + 24);
-
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm256_load_ps(b[i]);
-      mb1 = _mm256_load_ps(b[i] + 8);
-      mb2 = _mm256_load_ps(b[i] + 16);
-      mb3 = _mm256_load_ps(b[i] + 24);
-      ma0 = _mm256_add_ps(ma0, mb0);
-      ma1 = _mm256_add_ps(ma1, mb1);
-      ma2 = _mm256_add_ps(ma2, mb2);
-      ma3 = _mm256_add_ps(ma3, mb3);
-      b[i] += 32;
-    }
-
-    _mm256_store_ps(a, ma0);
-    _mm256_store_ps(a + 8, ma1);
-    _mm256_store_ps(a + 16, ma2);
-    _mm256_store_ps(a + 24, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_avx(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 32;
-  __m256 ma0, ma1, ma2, ma3;
-  __m256 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 32; k++, result += 32, data += 32) {
-    ma0 = _mm256_load_ps(result);
-    ma1 = _mm256_load_ps(result + 8);
-    ma2 = _mm256_load_ps(result + 16);
-    ma3 = _mm256_load_ps(result + 24);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm256_load_ps(data + i * dim);
-      mb1 = _mm256_load_ps(data + i * dim + 8);
-      mb2 = _mm256_load_ps(data + i * dim + 16);
-      mb3 = _mm256_load_ps(data + i * dim + 24);
-      ma0 = _mm256_max_ps(ma0, mb0);
-      ma1 = _mm256_max_ps(ma1, mb1);
-      ma2 = _mm256_max_ps(ma2, mb2);
-      ma3 = _mm256_max_ps(ma3, mb3);
-    }
-    _mm256_store_ps(result, ma0);
-    _mm256_store_ps(result + 8, ma1);
-    _mm256_store_ps(result + 16, ma2);
-    _mm256_store_ps(result + 24, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-
-static void decayL1_avx(float* dst, float* src, float lambda, size_t sz) {
-  int64_t i;
-  int64_t size = sz;
-  float src_val;
-
-  __m256 ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
-  //  __m256 ymm9, ymm10;
-
-  ymm1 = _mm256_set1_ps(lambda);
-  ymm2 = _mm256_setzero_ps();
-
-  for (i = 0; i <= size - 16; i += 16) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm6 = _mm256_load_ps(src + i + 8);
-
-    ymm4 = _mm256_sub_ps(ymm3, ymm1);
-    ymm7 = _mm256_sub_ps(ymm6, ymm1);
-
-    ymm5 = _mm256_add_ps(ymm3, ymm1);
-    ymm8 = _mm256_add_ps(ymm6, ymm1);
-
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm7 = _mm256_max_ps(ymm7, ymm2);
-
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm8 = _mm256_min_ps(ymm8, ymm2);
-
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    ymm8 = _mm256_or_ps(ymm7, ymm8);
-
-    _mm256_store_ps(dst + i, ymm5);
-    _mm256_store_ps(dst + i + 8, ymm8);
-  }
-  if (i <= size - 8) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm4 = _mm256_sub_ps(ymm3, ymm1);
-    ymm5 = _mm256_add_ps(ymm3, ymm1);
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    _mm256_store_ps(dst + i, ymm5);
-
-    i += 8;
-  }
-  for (; i < size; i++) {
-    src_val = src[i];
-    if (src_val > 0) {
-      dst[i] = ((src_val > lambda) ? (src_val - lambda) : 0);
-    } else {
-      dst[i] = ((-src_val > lambda) ? (src_val + lambda) : 0);
-    }
-  }
-}
-
-static void decayL1_avx(
-    float* dst, float* src, float* lr, float lambda, size_t sz) {
-  int64_t i;
-  int64_t size = sz;
-  float src_val;
-
-  __m256 ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
-  __m256 ymm9, ymm10;
-
-  ymm1 = _mm256_set1_ps(lambda);
-  ymm2 = _mm256_setzero_ps();
-
-  for (i = 0; i <= size - 16; i += 16) {
-    ymm9 = _mm256_load_ps(lr + i);
-    ymm10 = _mm256_load_ps(lr + i + 8);
-
-    ymm3 = _mm256_load_ps(src + i);
-    ymm6 = _mm256_load_ps(src + i + 8);
-
-    ymm9 = _mm256_mul_ps(ymm9, ymm1);
-    ymm10 = _mm256_mul_ps(ymm10, ymm1);
-
-    ymm4 = _mm256_sub_ps(ymm3, ymm9);
-    ymm7 = _mm256_sub_ps(ymm6, ymm10);
-
-    ymm5 = _mm256_add_ps(ymm3, ymm9);
-    ymm8 = _mm256_add_ps(ymm6, ymm10);
-
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm7 = _mm256_max_ps(ymm7, ymm2);
-
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm8 = _mm256_min_ps(ymm8, ymm2);
-
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    ymm8 = _mm256_or_ps(ymm7, ymm8);
-
-    _mm256_store_ps(dst + i, ymm5);
-    _mm256_store_ps(dst + i + 8, ymm8);
-  }
-  if (i <= size - 8) {
-    ymm3 = _mm256_load_ps(src + i);
-    ymm9 = _mm256_load_ps(lr + i);
-    ymm9 = _mm256_mul_ps(ymm9, ymm1);
-    ymm4 = _mm256_sub_ps(ymm3, ymm9);
-    ymm5 = _mm256_add_ps(ymm3, ymm9);
-    ymm4 = _mm256_max_ps(ymm4, ymm2);
-    ymm5 = _mm256_min_ps(ymm5, ymm2);
-    ymm5 = _mm256_or_ps(ymm4, ymm5);
-    _mm256_store_ps(dst + i, ymm5);
-
-    i += 8;
-  }
-  for (; i < size; i++) {
-    src_val = src[i];
-    float nlambda = lr[i] * lambda;
-    if (src_val > 0) {
-      dst[i] = ((src_val > nlambda) ? (src_val - nlambda) : 0);
-    } else {
-      dst[i] = ((-src_val > nlambda) ? (src_val + nlambda) : 0);
-    }
-  }
-}
-
-#elif defined(__SSE3__)
-
-static void addto_sse(float* a, const float* b, size_t len) {
-  int offset = len % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    mb0 = _mm_load_ps(b);
-    mb1 = _mm_load_ps(b + 4);
-    mb2 = _mm_load_ps(b + 8);
-    mb3 = _mm_load_ps(b + 12);
-
-    ma0 = _mm_add_ps(ma0, mb0);
-    ma1 = _mm_add_ps(ma1, mb1);
-    ma2 = _mm_add_ps(ma2, mb2);
-    ma3 = _mm_add_ps(ma3, mb3);
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-}
-
-static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 16;
-
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm_load_ps(b[i]);
-      mb1 = _mm_load_ps(b[i] + 4);
-      mb2 = _mm_load_ps(b[i] + 8);
-      mb3 = _mm_load_ps(b[i] + 12);
-      ma0 = _mm_add_ps(ma0, mb0);
-      ma1 = _mm_add_ps(ma1, mb1);
-      ma2 = _mm_add_ps(ma2, mb2);
-      ma3 = _mm_add_ps(ma3, mb3);
-      b[i] += 16;
-    }
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_sse(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
-    ma0 = _mm_load_ps(result);
-    ma1 = _mm_load_ps(result + 4);
-    ma2 = _mm_load_ps(result + 8);
-    ma3 = _mm_load_ps(result + 12);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm_load_ps(data + i * dim);
-      mb1 = _mm_load_ps(data + i * dim + 4);
-      mb2 = _mm_load_ps(data + i * dim + 8);
-      mb3 = _mm_load_ps(data + i * dim + 12);
-      ma0 = _mm_max_ps(ma0, mb0);
-      ma1 = _mm_max_ps(ma1, mb1);
-      ma2 = _mm_max_ps(ma2, mb2);
-      ma3 = _mm_max_ps(ma3, mb3);
-    }
-    _mm_store_ps(result, ma0);
-    _mm_store_ps(result + 4, ma1);
-    _mm_store_ps(result + 8, ma2);
-    _mm_store_ps(result + 12, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-
-#endif
-
-#if defined(__AVX__)
-#define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
-#elif defined(__SSE3__)
-#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
-#endif
-
-namespace paddle {
-namespace simd {
-namespace internal {
-#ifdef __SSE3__
-void addToImpl(float* a, const float* b, size_t len) {
-  SIMD_INVOKE(addto, a, b, len);
-}
-void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
-  SIMD_INVOKE(batch_addto, a, b, batch, len);
-}
-
-void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
-  SIMD_INVOKE(col_max, result, data, dim, numSamples);
-}
-#endif
-
-#ifdef __AVX__
-void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
-  decayL1_avx(dst, src, lambda, len);
-}
-void decayL1AvxImpl(
-    float* dst, float* src, float* lr, float lambda, size_t len) {
-  decayL1_avx(dst, src, lr, lambda, len);
-}
-#endif
-
-}  // namespace internal
-}  // namespace simd
-}  // namespace paddle
diff --git a/paddle/legacy/math/SIMDFunctions.h b/paddle/legacy/math/SIMDFunctions.h
deleted file mode 100644
index 5b1dfea9d..000000000
--- a/paddle/legacy/math/SIMDFunctions.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-#include <stdint.h>
-
-namespace paddle {
-
-namespace simd {
-
-namespace naive {
-template <typename Type>
-inline void addTo(Type* a, const Type* b, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    a[i] += b[i];
-  }
-}
-
-template <typename Type>
-inline void batchAddTo(Type* a, const Type* b[], int batch, size_t len) {
-  for (int i = 0; i < batch; ++i) {
-    for (size_t j = 0; j < len; ++j) {
-      a[j] += b[i][j];
-    }
-  }
-}
-
-/**
- * @note this method is unused in paddle.
- */
-template <typename Type>
-inline void colMax(Type* result, const Type* data, int dim, int numSamples) {
-  for (int d = 0; d < dim; ++d) {
-    Type sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = sm > data[i * dim + d] ? sm : data[i * dim + d];
-    }
-    result[d] = sm;
-  }
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type* lr, Type lambda, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    Type& src_val = src[i];
-    float nlambda = lr[i] * lambda;
-    if (src_val > 0) {
-      dst[i] = ((src_val > nlambda) ? (src_val - nlambda) : 0);
-    } else {
-      dst[i] = ((-src_val > nlambda) ? (src_val + nlambda) : 0);
-    }
-  }
-}
-
-template <class Type>
-inline void decayL1(Type* dst, Type* src, Type lambda, size_t len) {
-  for (size_t i = 0; i < len; ++i) {
-    Type& src_val = src[i];
-    if (src_val > 0) {
-      dst[i] = ((src_val > lambda) ? (src_val - lambda) : 0);
-    } else {
-      dst[i] = ((-src_val > lambda) ? (src_val + lambda) : 0);
-    }
-  }
-}
-}  // namespace naive
-
-template <typename Type>
-inline void addTo(Type* a, const Type* b, size_t len) {
-  naive::addTo(a, b, len);
-}
-
-template <typename Type>
-inline void batchAddTo(Type* a, const Type* b[], int batch, size_t len) {
-  naive::batchAddTo(a, b, batch, len);
-}
-
-template <typename Type>
-inline void colMax(Type* result, const Type* data, int dim, int numSamples) {
-  naive::colMax(result, data, dim, numSamples);
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type* lr, Type lambda, size_t len) {
-  naive::decayL1(dst, src, lr, lambda, len);
-}
-
-template <typename Type>
-inline void decayL1(Type* dst, Type* src, Type lambda, size_t len) {
-  naive::decayL1(dst, src, lambda, len);
-}
-
-template <size_t AlignSize>
-inline bool isPointerAlign(void* ptr) {
-  return reinterpret_cast<uintptr_t>(ptr) % AlignSize == 0;
-}
-
-inline bool vec_check(size_t len) {
-#ifdef __AVX__
-  return len % 8 == 0;
-#else
-  return len % 4 == 0;
-#endif
-}
-
-namespace internal {
-#ifdef __SSE3__
-void addToImpl(float* a, const float* b, size_t len);
-void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
-void colMaxImpl(float* result, const float* data, int dim, int numSamples);
-#endif
-#ifdef __AVX__
-void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
-void decayL1AvxImpl(
-    float* dst, float* src, float* lr, float lambda, size_t len);
-#endif
-}  // namespace internal
-
-template <>
-inline void addTo(float* a, const float* b, size_t len) {
-#ifdef __SSE3__
-  internal::addToImpl(a, b, len);
-#else
-  naive::addTo(a, b, len);
-#endif
-}
-
-template <>
-inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
-#ifdef __SSE3__
-  internal::batchAddToImpl(a, b, batch, len);
-#else
-  naive::batchAddTo(a, b, batch, len);
-#endif
-}
-
-template <>
-inline void colMax(float* result, const float* data, int dim, int numSamples) {
-#ifdef __SSE3__
-  internal::colMaxImpl(result, data, dim, numSamples);
-#else
-  naive::colMax(result, data, dim, numSamples);
-#endif
-}
-
-template <>
-inline void decayL1(float* dst, float* src, float lambda, size_t len) {
-#ifdef __AVX__
-  internal::decayL1AvxImpl(dst, src, lambda, len);
-#else
-  naive::decayL1(dst, src, lambda, len);
-#endif
-}
-
-template <>
-inline void decayL1(
-    float* dst, float* src, float* lr, float lambda, size_t len) {
-#ifdef __AVX__
-  internal::decayL1AvxImpl(dst, src, lr, lambda, len);
-#else
-  naive::decayL1(dst, src, lr, lambda, len);
-#endif
-}
-
-}  // namespace simd
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseMatrix.cpp b/paddle/legacy/math/SparseMatrix.cpp
deleted file mode 100644
index 6f68252b0..000000000
--- a/paddle/legacy/math/SparseMatrix.cpp
+++ /dev/null
@@ -1,864 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseMatrix.h"
-#include <algorithm>
-#include <iostream>
-#include <vector>
-#include "hl_gpu.h"
-#include "hl_top_k.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-GpuSparseMatrix::GpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  resize(height, width, nnz, valueType, format);
-}
-
-GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                                 hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(dataHandle, height, width, trans, true) {
-  CHECK(dataHandle && sMatrix) << "Invalid argument pointer";
-
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  CHECK_LE(size, dataHandle->getSize());
-
-  sMatrix_ = sMatrix;
-
-  if (sMemoryHandle == NULL) {
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(dataHandle->getSize());
-  } else {
-    CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize());
-    sMemoryHandle_ = sMemoryHandle;
-  }
-
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-  if (format_ == SPARSE_CSR)
-    sparseResizeCSR();
-  else
-    sparseResizeCSC();
-}
-
-GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(NULL, height, width, trans, true) {
-  CHECK(sMatrix) << "Invalid argument pointer";
-  sMatrix_ = sMatrix;
-  sMemoryHandle_ = sMemoryHandle;
-  elementCnt_ = nnz;
-  format_ = format;
-  valueType_ = valueType;
-}
-
-GpuSparseMatrix::GpuSparseMatrix(real* value,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-
-  sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(size);
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSR,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSC,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSR() {
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (height_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        data_,
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSR,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSC() {
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (width_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        memoryHandle_->getBuf(),
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSC,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  if (format == SPARSE_CSR) {
-    resizeCSR(newHeight, newWidth, newNnz, valueType);
-  } else {
-    resizeCSC(newHeight, newWidth, newNnz, valueType);
-  }
-}
-
-void GpuSparseMatrix::resizeCSR(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newHeight + 1) * sizeof(int)) >
-            ((char*)cols_ - (char*)rows_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSR;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSR();
-  }
-}
-
-void GpuSparseMatrix::resizeCSC(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize,
-     *       and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newWidth + 1) * sizeof(int)) >
-            ((char*)rows_ - (char*)cols_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSC;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSC();
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight, newWidth, elementCnt_, valueType_, format_);
-}
-
-MatrixPtr GpuSparseMatrix::getTranspose() {
-  CHECK(memoryHandle_.get() || sMatrix_) << "not supported";
-  if (memoryHandle_.get()) {
-    MatrixPtr copy_T(new GpuSparseMatrix(
-        std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-        sMatrix_,
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true,
-        sMemoryHandle_));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_,
-                                         height_,
-                                         width_,
-                                         elementCnt_,
-                                         valueType_,
-                                         format_,
-                                         true,
-                                         sMemoryHandle_));
-    return copy_T;
-  }
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  memcpy(cols_ + offsets, row, sizeof(int) * colNum);
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (auto mat = dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<CpuSparseMatrix*>(mat)), stream);
-  } else if (auto mat = dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<GpuSparseMatrix*>(mat)), stream);
-  } else {
-    LOG(FATAL) << "Not implemented";
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src) {
-  copyFrom(src, HPPL_STREAM_1);
-  hl_stream_synchronize(HPPL_STREAM_1);
-}
-
-template <class T>
-void GpuSparseMatrix::copyFrom(int64_t* ids,
-                               int64_t* indices,
-                               T* data,
-                               hl_stream_t stream) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  size_t nnz = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    nnz += indices[id + 1] - indices[id];
-  }
-
-  resize(height_,
-         width_,
-         nnz,
-         sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE,
-         format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-
-    T* row = data + indices[id];
-    copyRow(rows_[i], colNum, row);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-  hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream);
-}
-
-void GpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (NO_VALUE == valueType_) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL == values);
-  } else {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL != values);
-  }
-  if (0 == row) {
-    rows_[row] = 0;
-  }
-  rows_[row + 1] = rows_[row] + colNum;
-
-  memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum);
-  if (FLOAT_VALUE == valueType_) {
-    memcpy(value_ + rows_[row], values, sizeof(*values) * colNum);
-  }
-
-  if (height_ - 1 == row) {
-    sMatrix_->format = HL_SPARSE_CSR;
-    sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-    sMatrix_->rows = height_;
-    sMatrix_->cols = width_;
-    sMatrix_->nnz = elementCnt_;
-    hl_memcpy_csr_matrix(
-        sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT);
-  }
-}
-
-SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
-
-void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK_EQ(format_, SPARSE_CSC);
-  int nnz = sMatrix_->nnz;
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuSparseMatrix>(
-        width_, height_, nnz, valueType_, format_, false);
-  } else {
-    CHECK(matTrans != nullptr);
-  }
-
-  CpuIVector rows(nnz);
-  CpuIVector cols(width_ + 1);
-  CpuIVector cols_full(nnz);
-  CpuVector value(nnz);
-  hl_stream_t stream = HPPL_STREAM_1;
-  hl_memcpy_from_csc_matrix(value.getData(),
-                            nnz,
-                            rows.getData(),
-                            nnz,
-                            cols.getData(),
-                            width_ + 1,
-                            sMatrix_.get(),
-                            stream);
-
-  hl_stream_synchronize(stream);
-
-  /*for every non zero number, get its column index*/
-  std::vector<Element> dataVec;
-  for (size_t i = 0; i < width_; i++) {
-    for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) {
-      cols_full.getData()[j] = i;
-    }
-  }
-
-  /*sort row index and column index by the ascending order*/
-  for (int i = 0; i < nnz; i++) {
-    dataVec.emplace_back(
-        rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
-  }
-  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
-    return a.row < b.row || (a.row == b.row && a.col < b.col);
-  });
-
-  /*get sorted data, row index, and col index, put them in the right place*/
-  cols.resize(height_ + 1);
-  rows.resize(nnz);
-  value.resize(nnz);
-
-  cols.getData()[0] = 0;
-  rows.getData()[0] = dataVec[0].col;
-  value.getData()[0] = dataVec[0].val;
-  for (int i = 1; i < nnz; i++) {
-    if (dataVec[i].row != dataVec[i - 1].row) {
-      for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) {
-        cols.getData()[j] = i;
-      }
-    }
-    rows.getData()[i] = dataVec[i].col;
-    value.getData()[i] = dataVec[i].val;
-  }
-  cols.getData()[height_] = nnz;
-
-  /*copy back from cpu*/
-  GpuSparseMatrixPtr dest =
-      std::dynamic_pointer_cast<GpuSparseMatrix>(matTrans);
-  hl_memcpy_csc_matrix((dest->sMatrix_).get(),
-                       value.getData(),
-                       rows.getData(),
-                       cols.getData(),
-                       stream);
-  hl_stream_synchronize(stream);
-}
-
-void GpuSparseMatrix::mul(const GpuMatrix& a,
-                          const GpuMatrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(a.useGpu_ && b.useGpu_) << "type not match";
-  CHECK(!trans_) << "trans not supported";
-  real* A_d = (real*)a.getData();
-  real* B_d = (real*)b.getData();
-  hl_sparse_matrix_s C_d = sMatrix_.get();
-  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-
-  if (!a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getWidth() == b.getHeight());
-  } else if (a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getWidth());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getHeight() == b.getHeight());
-  } else if (!a.trans_ && b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getHeight());
-    CHECK(a.getWidth() == b.getWidth());
-  } else {
-    LOG(INFO) << "Not support";
-  }
-  int dimM = height_;
-  int dimN = width_;
-  int dimK = !b.trans_ ? b.getHeight() : b.getWidth();
-  hl_sparse_matrix_mul(
-      A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT);
-}
-
-void GpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void GpuSparseMatrix::print(std::ostream& os) const {
-  if (format_ == SPARSE_CSC) {
-    int nnz = sMatrix_->nnz;
-    IVectorPtr rows = IVector::create(nnz, false);
-    IVectorPtr cols = IVector::create(width_ + 1, false);
-    VectorPtr value = Vector::create(nnz, false);
-    hl_stream_t stream = HPPL_STREAM_DEFAULT;
-    hl_memcpy_from_csc_matrix(value->getData(),
-                              value->getSize(),
-                              rows->getData(),
-                              rows->getSize(),
-                              cols->getData(),
-                              cols->getSize(),
-                              sMatrix_.get(),
-                              stream);
-    hl_stream_synchronize(stream);
-
-    printBuf(os, cols->getData(), width_ + 1, "col idx");
-    printBuf(os, rows->getData(), elementCnt_, "row idx");
-    printBuf(os, value->getData(), elementCnt_, "value");
-  }
-}
-
-void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK(trans_ == src.trans_);
-  CHECK(format_ == src.getFormat());
-  resize(src.getHeight(),
-         src.getWidth(),
-         elementCnt_,
-         valueType_,
-         src.getFormat());
-
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    hl_memcpy_async(
-        getValue(), src.getValue(), sizeof(real) * elementCnt_, stream);
-  }
-  CHECK(getRows());
-  CHECK(src.getRows());
-
-  hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream);
-  hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream);
-}
-
-void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) {
-  if (format_ == SPARSE_CSR) {
-    copyFromCSR(src, stream);
-  } else {
-    copyFromCSC(src, stream);
-  }
-}
-
-void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  int* srcCols = src.getCols();
-  size_t nnz = std::count_if(srcCols,
-                             srcCols + src.getElementCnt(),
-                             [this](size_t n) { return n < this->width_; });
-  resize(height_, width_, nnz, valueType_, format_);
-
-  rows_[0] = 0;
-  size_t index = 0;
-  for (size_t r = 0; r < height_; ++r) {
-    for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-      if (srcCols[i] < (int)width_) {
-        cols_[index] = srcCols[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-    }
-    rows_[r + 1] = index;
-  }
-  CHECK_EQ(index, nnz);
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  size_t nnz = src.getCols()[width_] - src.getCols()[0];
-  resize(height_, width_, nnz, valueType_, format_);
-
-  cols_[0] = 0;
-  for (size_t i = 0; i < width_; i++) {
-    cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i));
-  }
-  memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  if (format_ == SPARSE_CSR) {
-    trimFromCSR(src);
-  } else {
-    trimFromCSC(src);
-  }
-}
-
-void GpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_bias(A_d, b.getData(), scale);
-}
-
-void GpuSparseMatrix::add3(GpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* B_d = b->getData();
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0);
-}
-
-void GpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<GpuMatrix*>(b.get())) {
-    add3(dynamic_cast<GpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void GpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  real* value = getValue();
-  if (value == NULL) {
-    LOG(FATAL) << "value is nullptr";
-  }
-  hl_matrix_zero_mem(value, elementCnt_);
-}
-
-void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR";
-
-  hl_sparse_matrix_top_k(maxVal.getData(),
-                         maxVal.getStride(),
-                         maxIds.getData(),
-                         sMatrix_.get(),
-                         beam,
-                         numSamples);
-#endif
-}
-
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data,
-                                        hl_stream_t stream);
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data,
-                                        hl_stream_t stream);
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseMatrix.h b/paddle/legacy/math/SparseMatrix.h
deleted file mode 100644
index 9181fa292..000000000
--- a/paddle/legacy/math/SparseMatrix.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "CpuSparseMatrix.h"
-#include "Matrix.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
-
-class GpuSparseMatrix : public Matrix {
- public:
-  MemoryHandlePtr sMemoryHandle_;
-  int* rows_;
-  int* cols_;
-  real* value_;
-  const char* end_; /* point to the end of sMemoryHandle_ */
-
-  hl_sparse_matrix_s_ptr sMatrix_;
-  SparseValueType valueType_;
-  SparseFormat format_;
-
- public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false);
-
-  GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                  hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false,
-                  MemoryHandlePtr sMemoryHandle = NULL);
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans,
-                  MemoryHandlePtr sMemoryHandle);
-
- protected:
-  struct Element {
-    int row;
-    int col;
-    real val;
-    Element(int rowIn, int colIn, real valIn)
-        : row(rowIn), col(colIn), val(valIn) {}
-  };
-
- public:
-  ~GpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-
-  void resize(size_t newHeight, size_t newWidth);
-
-  void sparseResizeCSR();
-
-  void sparseResizeCSC();
-
-  void resizeCSR(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void resizeCSC(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-  /// B = A , B.trans = !A.trans
-  MatrixPtr getTranspose();
-
-  /// B = A'
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-  void copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const IVector& src) { LOG(FATAL) << "not implemented"; }
-  void copyFrom(const IVector& src, hl_stream_t stream) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream);
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-  SparseValueType getValueType() const;
-  SparseFormat getFormat() const { return format_; }
-
-  const int* getRowCols(size_t x) const { return cols_ + rows_[x]; }
-  const real* getRowValues(size_t x) const { return value_ + rows_[x]; }
-  size_t getColNum(size_t x) const { return rows_[x + 1] - rows_[x]; }
-  void print(std::ostream& os) const;
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   *
-   * Only add value of same (row, col) index in dense matrix
-   * and do not use others values.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(GpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  /**
-   * @brief return rows, which is gpu address
-   */
-  int* getRows() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_rows(sMatrix_.get());
-  }
-
-  /**
-   * @brief return cols, which is gpu address
-   */
-  int* getCols() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_cols(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value, which is gpu address
-   */
-  real* getValue() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_value(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief  Get top k value of each row in sparse matrix.
-   *
-   * Store the value in maxVal and theirs index in maxIds.
-   * k = maxVal.width
-   *
-   * @param[out]  maxIds    index of top k
-   * @param[out]  maxVal    value of top k
-   */
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-
- protected:
-  void sparseResize();
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
- public:
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFrom(GpuSparseMatrix& src, hl_stream_t stream);
-
-  void trimFrom(const CpuSparseMatrix& src);
-  void trimFromCSR(const CpuSparseMatrix& src);
-  void trimFromCSC(const CpuSparseMatrix& src);
-
-  // BaseMatrixT interface
- public:
-  bool isSparse() const { return true; }
-
- private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-
-}  // namespace paddle
-
-#else
-
-#include "CpuSparseMatrix.h"
-
-namespace paddle {
-
-class GpuSparseMatrix : public Matrix {
- public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, true) {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/SparseRowMatrix.cpp b/paddle/legacy/math/SparseRowMatrix.cpp
deleted file mode 100644
index 39bcdf229..000000000
--- a/paddle/legacy/math/SparseRowMatrix.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseRowMatrix.h"
-#include "CpuSparseMatrix.h"
-
-#include <algorithm>
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "SIMDFunctions.h"
-
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
-
-void SparseRowCpuMatrix::init(size_t height, size_t width) {
-  height_ = height;
-  if (!indexDictHandle_) {
-    indexDictHandle_.reset(new IndexDict);
-    indexDictHandle_->globalIndices.assign(height, kUnusedId_);
-  }
-  localIndices_ = &indexDictHandle_->localIndices;
-  globalIndices_ = indexDictHandle_->globalIndices.data();
-}
-
-void SparseRowCpuMatrix::mul(CpuSparseMatrix* a,
-                             CpuMatrix* b,
-                             real scaleAB,
-                             real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {
-  LOG(FATAL) << "This should not be called";
-}
-
-void SparseRowCpuMatrix::zeroMem() {
-  apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); });
-  clearRows();
-}
-
-void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) {
-  apply([=](real* buf, size_t len) {
-    CpuVector value(0, nullptr);
-    value.subVecFrom(buf, 0, len);
-    value.applyL1(learningRate, decayRate);
-  });
-}
-
-void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value,
-                                   IVector& t0,
-                                   real learningRate,
-                                   int currentTime,
-                                   real decayRate,
-                                   bool useL1,
-                                   bool fini) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-
-  // t0 and value are vectors
-  CHECK_EQ(t0.getSize(), this->height_);
-  CHECK_EQ(value.width_, this->height_ * this->width_);
-
-  if (decayRate == 0.0f) {
-    if (fini) {
-      return;
-    }
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-    }
-    return;
-  }  // else
-
-  if (useL1) {  // L1 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real delta = tDiff * learningRate * decayRate;
-          simd::decayL1(v, v, delta, this->width_);
-        }
-      }
-      return;
-    }  // else
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real delta = tDiff * learningRate * decayRate;
-        simd::decayL1(v, v, delta, this->width_);
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-      simd::decayL1(v, v, learningRate * decayRate, this->width_);
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-
-  } else {  // L2 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-          for (size_t j = 0; j < this->width_; ++j) {
-            v[j] *= recip;
-          }
-        }
-      }
-      return;
-    }  // else
-
-    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-        for (size_t j = 0; j < this->width_; ++j) {
-          v[j] *= recip;
-        }
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] = recipDecay * (v[j] - learningRate * g[j]);
-      }
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(BaseMatrix& dest,
-                               std::vector<uint32_t>& ids,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_);
-      ids.push_back(id);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      dest.checkIndex(id);
-      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));
-    }
-  }
-}
-
-void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a,
-                                     CpuMatrix* b,
-                                     real scaleAB,
-                                     real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-      a, b, this, scaleAB, scaleT);
-}
-
-void CacheRowCpuMatrix::mul(CpuSparseMatrix* a,
-                            CpuMatrix* b,
-                            real scaleAB,
-                            real scaleT) {
-  CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < len; i++) {
-    CHECK_LT(*(ids + i), this->getHeight())
-        << "id:" << *(ids + i) << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-  }
-  localIndices.insert(localIndices.end(), ids, ids + len);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
-  CHECK(mat) << "only support sparse matrix";
-  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
-          mat->getElementCnt());
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  size_t numSamples = ids->getSize();
-  int* index = ids->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-
-    unsigned int id = (unsigned int)index[i];
-    CHECK_LT(id, this->getHeight())
-        << "id:" << id << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-    localIndices.push_back(id);
-  }
-}
-
-void SparsePrefetchRowCpuMatrix::setupIndices() {
-  auto& localIndices = indexDictHandle_->localIndices;
-  uniqueIds(localIndices);
-  // for each sparse row
-  for (size_t id = 0; id < localIndices.size(); ++id) {
-    globalIndices_[localIndices[id]] = id;  // sparse row -> local id
-  }
-  checkStoreSize();
-}
-
-void SparseRowCpuMatrix::checkIndices() {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    CHECK_EQ(globalIndices_[localIndices[i]], i);
-  }
-  checkStoreSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/SparseRowMatrix.h b/paddle/legacy/math/SparseRowMatrix.h
deleted file mode 100644
index e206747a4..000000000
--- a/paddle/legacy/math/SparseRowMatrix.h
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <gflags/gflags.h>
-#include <string.h>
-#include <algorithm>
-#include "Matrix.h"
-#include "RowBuffer.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * Sparse Row
- */
-class SparseRowCpuMatrix : public CpuMatrix {
- public:
-  struct IndexDict {
-    // In the following, global id means the row id in the original matrix.
-    // Local id means the row id in the local storage which only contains
-    // the sparse rows.
-    std::vector<unsigned int> localIndices;   // local id -> global id
-    std::vector<unsigned int> globalIndices;  // global id -> local id
-  };
-  typedef std::shared_ptr<IndexDict> IndexDictPtr;
-
-  /// heightStore is max number of rows of the sparse matrix.
-  SparseRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                     size_t height,
-                     size_t width,
-                     IndexDictPtr indexDictHandle = nullptr,
-                     bool trans = false)
-      : CpuMatrix(nullptr, height, width, trans),
-        indexDictHandle_(indexDictHandle) {
-    init(height, width);
-    buf_.reset(new RowBuffer(dataHandle, width));
-  }
-
-  virtual ~SparseRowCpuMatrix() {}
-
- public:
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in the original matrix
-   */
-  real* getRow(size_t row) {
-    CHECK_NE(globalIndices_[row], kUnusedId_);
-    return getLocalRow(globalIndices_[row]);
-  }
-
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in local storage
-   */
-  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
-
-  /**
-   *  reserve the storage for rows according to current size of
-   * indexDictHandle.
-   *
-   *  This is only used when SparseRowCpuMatrix is constructed with
-   *  indexDictHandle.
-   */
-  void reserveStore() { buf_->resize(localIndices_->size()); }
-
-  // row is the row id in the original matrix
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  /**
-   * Fill data according to row indexs added, setup indices inside.
-   *
-   * *src* and *size* are data and size of normal dense CpuMatrix.
-   */
-  virtual void copyFrom(const real* src, size_t size);
-  virtual void zeroMem();
-
-  /**
-   * apply L1 to all sparse rows, should be apply after indices ready.
-   */
-  virtual void applyL1(real learningRate, real decayRate);
-
-  void clearIndices() { clearRows(); }
-  void zeroMemThread(size_t tid, size_t numThreads);
-
-  /**
-   *  value -= grad * learningRate,  this is gradient.
-   *
-   * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
-   *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter
-   * matrix,
-   * store the time that each weight row last updated.
-   *
-   * Time is batchId, currentTime is current batchId.
-   *
-   * While pass finished, caller should call this func one more time
-   *  with (fini=true) to let weight decay catch up current time.
-   */
-  void sgdUpdate(BaseMatrix& value,
-                 IVector& t0,
-                 real learningRate,
-                 int currentTime,
-                 real decayRate,
-                 bool useL1,
-                 bool fini = false);
-
-  /**
-   *  merge rows in *this* to *dest* for designated thread
-   *
-   *  values add to *dest* matrix
-   *
-   *  ids occured in *this* append to *ids*
-   *  filtered by  (id % numThreads == tid)
-   */
-  void addTo(BaseMatrix& dest,
-             std::vector<uint32_t>& ids,
-             size_t tid,
-             size_t numThreads);
-
-  /**
-   *  the second version addTo(), *dest* is a SparseRowCpuMatrix.
-   *
-   *  The dest's indices should be setup already, addTo() will
-   *  check src ids is exist in dest's indices.
-   */
-  void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads);
-
-  const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; }
-
-  /**
-   *  check all local and global indices consistency
-   */
-  void checkIndices();
-  /**
-   *  check whether row *i* exist in indices
-   */
-  void checkIndex(size_t i) {
-    size_t localId = globalIndices_[i];
-    CHECK_LT(localId, localIndices_->size());
-    CHECK_EQ((*localIndices_)[localId], i);
-  }
-
-  std::vector<unsigned int>& getLocalIndices() const {
-    return indexDictHandle_->localIndices;
-  }
-
- protected:
-  template <typename Func>
-  void apply(Func f) {
-    f(buf_->data(), localIndices_->size() * width_);
-  }
-
-  void init(size_t height, size_t width);
-
-  /// clear row indices.
-  void clearRows() {
-    for (auto id : *localIndices_) {
-      globalIndices_[id] = kUnusedId_;
-    }
-    localIndices_->clear();
-    buf_->clear();
-  }
-
-  inline void checkStoreSize() {
-    if (buf_->isAutoGrowth()) {
-      if (buf_->getRowCount() > 0.5 * height_) {
-        LOG(WARNING) << "There are more than 0.5*height ("
-                     << localIndices_->size() << ") rows are used for sparse "
-                     << "update, which is not efficient. Considering not use "
-                     << "sparse_update.";
-      }
-    } else {
-      CHECK_LE(localIndices_->size(), buf_->getRowCount());
-    }
-  }
-
-  std::unique_ptr<RowBuffer> buf_;
-  IndexDictPtr indexDictHandle_;
-  std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
-  unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
-  static const unsigned int kUnusedId_;
-};
-
-class SyncThreadPool;
-
-/// For prefetching parameters from remote Parameter server
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                             size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             SyncThreadPool* pool = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans),
-        pool_(pool) {}
-
-  /**
-   * Extract feature ids from *input*, to fill row indexs.
-   *
-   * *input* must be sparse matrix.
-   *
-   * Can call many times before setup.
-   */
-  void addRows(MatrixPtr input);
-  void addRows(IVectorPtr ids);
-
-  /**
-   * setup global indices of SparseRowMatrix after finish add rows.
-   */
-  void setupIndices();
-
- protected:
-  void addRows(const unsigned int* ids, size_t len);
-  SyncThreadPool* pool_;
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  SparseAutoGrowRowCpuMatrix(size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {}
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-};
-
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
- public:
-  CacheRowCpuMatrix(size_t height,
-                    size_t width,
-                    IndexDictPtr indexDictHandle = nullptr,
-                    bool trans = false)
-      : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans),
-        sourceData_(nullptr) {}
-
-  void setSourceData(CpuVectorPtr sourceVec) {
-    sourceDataVec_ = sourceVec;
-    sourceData_ = sourceVec->getData();
-  }
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-      memcpy(
-          getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_);
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
- public:
-  CpuVectorPtr sourceDataVec_;
-  real* sourceData_;
-};
-
-/**
- * Sparse Row Ids Matrix.
- *
- * mostly same as CpuMatrix, but maintain sparse row ids occured,
- * ids are hashed by worker thread id.
- */
-class SparseRowIdsCpuMatrix : public CpuMatrix {
- public:
-  SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
-                        size_t height,
-                        size_t width,
-                        bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {}
-
-  void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); }
-
-  std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
-
- private:
-  std::vector<std::vector<uint32_t>> idsArray_;
-};
-
-}  // namespace paddle
-
-#else
-namespace paddle {
-
-class SparseRowCpuMatrix : public CpuMatrix {
- public:
-  void reserveStore() {}
-  void clearIndices() {}
-};
-
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
- public:
-  void setupIndices() {}
-  void addRows(MatrixPtr input) {}
-  void addRows(IVectorPtr ids) {}
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
-class SparseRowIdsCpuMatrix : public CpuMatrix {};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/Storage.cpp b/paddle/legacy/math/Storage.cpp
deleted file mode 100644
index 65d53aeaa..000000000
--- a/paddle/legacy/math/Storage.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Storage.h"
-#include "Allocator.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-DEFINE_int32(pool_limit_size,
-             536870912,
-             "maximum memory size managed by a memory pool, default is 512M");
-#else
-DEFINE_int32(pool_limit_size, 0, "default is 0");
-#endif
-
-namespace paddle {
-
-// Initialization StorageEngine singleton.
-// Other modules may rely on storage management,
-// so StorageEngine need to be initialized before other modules.
-static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
-                                          std::numeric_limits<int>::max());
-
-StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
-
-StorageEngine::~StorageEngine() {
-  delete cpuAllocator_;
-  for (auto it : gpuAllocator_) {
-    delete it;
-  }
-}
-
-StorageEngine* StorageEngine::singleton() {
-  static StorageEngine storage;
-  return &storage;
-}
-
-PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
-  {
-    // if gpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (deviceId < static_cast<int>(gpuAllocator_.size()) &&
-        (gpuAllocator_[deviceId] != nullptr)) {
-      return gpuAllocator_[deviceId];
-    }
-  }
-
-  {
-    // Construct gpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (deviceId >= static_cast<int>(gpuAllocator_.size())) {
-      gpuAllocator_.resize(deviceId + 1);
-    }
-    if (gpuAllocator_[deviceId] == nullptr) {
-      std::string name =
-          "gpu" + str::to_string(deviceId) + std::string("_pool");
-      gpuAllocator_[deviceId] =
-          new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
-    }
-    return gpuAllocator_[deviceId];
-  }
-}
-
-PoolAllocator* StorageEngine::getCpuAllocator() {
-  {
-    // if cpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (cpuAllocator_ != nullptr) {
-      return cpuAllocator_;
-    }
-  }
-
-  {
-    // Construct cpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (cpuAllocator_ == nullptr) {
-      if (FLAGS_use_gpu) {
-        cpuAllocator_ = new PoolAllocator(
-            new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool");
-      } else {
-        cpuAllocator_ = new PoolAllocator(
-            new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool");
-      }
-    }
-    return cpuAllocator_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Storage.h b/paddle/legacy/math/Storage.h
deleted file mode 100644
index bd22dde2c..000000000
--- a/paddle/legacy/math/Storage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>
-#include <vector>
-#include "PoolAllocator.h"
-#include "paddle/legacy/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * @brief Storage manager for multiple devices.
- */
-class StorageEngine {
- public:
-  /**
-   * @return Storage singleton
-   */
-  static StorageEngine* singleton();
-
-  /**
-   * @return return one gpu allocator by deviceId
-   */
-  PoolAllocator* getGpuAllocator(int deviceId);
-
-  /**
-   * @return return cpu allocator
-   */
-  PoolAllocator* getCpuAllocator();
-
- protected:
-  StorageEngine();
-  ~StorageEngine();
-  RWLock lock_;
-  std::vector<PoolAllocator*> gpuAllocator_;
-  PoolAllocator* cpuAllocator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorApply.h b/paddle/legacy/math/TensorApply.h
deleted file mode 100644
index 8b642047b..000000000
--- a/paddle/legacy/math/TensorApply.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-
-/**
- * \brief The tensor evaluator classes.
- */
-template <typename Derived, class T>
-class TensorApply {
- public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
-  INLINE T& applyRef(int index) { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-/**
- * \brief The tensor evaluator classes.
- * evaluator for rvalues
- */
-template <typename Derived, class T>
-class TensorApply<const Derived, T> {
- public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  const T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-template <typename Derived, class T>
-class TensorApply<const TensorExpression<Derived, T>, T> {
- public:
-  explicit TensorApply(const TensorExpression<Derived, T>& expr)
-      : expr_(expr.derived()) {}
-
-  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
-  INLINE T apply(int index) const { return expr_.apply(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  TensorApply<const Derived, T> expr_;
-};
-
-/**
- * \brief The unary expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
- public:
-  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
-  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-/**
- * \brief The binary expression evaluator classes.
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
- public:
-  explicit INLINE TensorApply(
-      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
-      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
-  }
-  INLINE T apply(int index) const {
-    return op_(lhs_.apply(index), rhs_.apply(index));
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
-  const OP op_;
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<RhsType, T> rhs_;
-};
-
-/**
- * \brief The ternary expression evaluator classes.
- */
-template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
-class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
- public:
-  explicit INLINE TensorApply(
-      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
-      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
-    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
-    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
-    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
-    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
-    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
-  }
-  INLINE T apply(int index) const {
-    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return expr1_.getWidth(); }
-  INLINE size_t getHeight() const { return expr1_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return expr1_.isContiguous() && expr2_.isContiguous() &&
-           expr3_.isContiguous();
-  }
-  INLINE bool useGpu() const { return expr1_.useGpu(); }
-
-  TensorApply<ArgType1, T> expr1_;
-  TensorApply<ArgType2, T> expr2_;
-  TensorApply<ArgType3, T> expr3_;
-};
-
-/**
- * \brief The const expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
- public:
-  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(i, j); }
-  INLINE T apply(int index) const { return op_(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return true; }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorAssign.h b/paddle/legacy/math/TensorAssign.h
deleted file mode 100644
index efbfce6c4..000000000
--- a/paddle/legacy/math/TensorAssign.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Tensor Assign Expression(return by lazyAssign,
- * and evaluated by AssignEvaluate)
- */
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp {
- public:
-  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
-      : lhs_(lhs), rhs_(rhs) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE void apply(const int i, const int j) {
-    lhs_.applyRef(i, j) = rhs_.apply(i, j);
-  }
-  INLINE void apply(const int index) {
-    lhs_.applyRef(index) = rhs_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
- private:
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<const RhsType, T> rhs_;
-};
-
-template <typename Assign, typename... AssignOp>
-void AssignCpuEvaluate(int height,
-                       int width,
-                       bool isContiguous,
-                       Assign&& assign,
-                       AssignOp&&... args) {
-  if (isContiguous) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      assign.apply(index);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        assign.apply(i, j);
-        __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate1(const int border,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    assign.apply(idx);
-    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
-  }
-}
-
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate2(const int height,
-                                   const int width,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
-      assign.apply(i, j);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-    }
-  }
-}
-#endif
-
-/**
- * \brief Evaluate one or more TensorAssignOp objects.
- *
- * \note At least one assignment expression is required
- */
-template <typename Assign, typename... AssignOp>
-void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
-  const bool useGpu_ = assign.useGpu();
-  bool isContiguous_ = assign.isContiguous();
-  const size_t height = assign.getHeight();
-  const size_t width = assign.getWidth();
-
-  const int packSize = sizeof...(args);
-  const bool packUseGpu[] = {((args)).useGpu()...};
-  const bool packIsContiguous[] = {((args)).isContiguous()...};
-  const size_t packHeight[] = {((args)).getHeight()...};
-  const size_t packWidth[] = {((args)).getWidth()...};
-
-  for (int i = 0; i < packSize; i++) {
-    CHECK_EQ(useGpu_, packUseGpu[i]);
-    CHECK_EQ(height, packHeight[i]);
-    CHECK_EQ(width, packWidth[i]);
-    isContiguous_ = isContiguous_ && packIsContiguous[i];
-  }
-
-  if (useGpu_) {
-#ifdef __NVCC__
-    if (isContiguous_) {
-      int size = height * width;
-      int blockSize = size <= 1024 ? size : 1024;
-      int gridSize = (size + 1024 - 1) / 1024;
-      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-          size, assign, args...);
-    } else {
-      int blockSizeY = std::min(32, (int)height);
-      int blockSizeX = (32 / blockSizeY) * 32;
-      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
-      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
-      dim3 threads(blockSizeX, blockSizeY);
-      dim3 grid(gridSizeX, gridSizeY);
-      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          height, width, assign, args...);
-    }
-
-    CHECK_SYNC("AssignEvaluate failed");
-#endif
-  } else {
-    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorEvaluate.h b/paddle/legacy/math/TensorEvaluate.h
deleted file mode 100644
index 3029dd35f..000000000
--- a/paddle/legacy/math/TensorEvaluate.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "hl_base.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief The tensor cpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int height = lhs_.getHeight();
-  int width = lhs_.getWidth();
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      lhs_.applyRef(index) = rhs_.apply(index);
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        lhs_.applyRef(i, j) = rhs_.apply(i, j);
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs,
-                                    RightType rhs,
-                                    const int border) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    lhs.applyRef(idx) = rhs.apply(idx);
-  }
-}
-
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
-      lhs.applyRef(i, j) = rhs.apply(i, j);
-    }
-  }
-}
-
-/**
- * \brief The tensor gpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int dimM = lhs_.getHeight();
-  int dimN = lhs_.getWidth();
-
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-        lhs_, rhs_, size);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
-  }
-
-  CHECK_SYNC("TensorGpuApply failed");
-}
-#else
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
-  LOG(FATAL) << "Since it is gcc compiled, "
-                "this calculation does not support GPU implementation.";
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/TensorExpression.h b/paddle/legacy/math/TensorExpression.h
deleted file mode 100644
index 1c6cf0783..000000000
--- a/paddle/legacy/math/TensorExpression.h
+++ /dev/null
@@ -1,446 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "hl_tensor_ops.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-template <class OP, typename ExprType, class T>
-class TensorConstant;
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp;
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp;
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp;
-
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp;
-
-/**
- * \brief Tensor base class.
- *
- * This is the base class of all Tensor and Expression class.
- */
-template <typename Derived, class T>
-class TensorExpression {
- public:
-  /**
-   * Element wise unary expression.
-   */
-  template <typename UnaryOp>
-  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
-      const UnaryOp& op) const {
-    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
-  }
-
-  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-      T p) const {
-    return unaryExpression(hppl::unary::add_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
-      T p) const {
-    return unaryExpression(hppl::unary::sub_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-      T p) const {
-    return unaryExpression(hppl::unary::mul_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
-      T p) const {
-    return unaryExpression(hppl::unary::div_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
-    return unaryExpression(hppl::unary::neg<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
-    return unaryExpression(hppl::unary::exp_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
-    return unaryExpression(hppl::unary::log_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
-    return unaryExpression(hppl::unary::sqrt_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
-    return unaryExpression(hppl::unary::square<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
-      const {
-    return unaryExpression(hppl::unary::reciprocal<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
-    return unaryExpression(hppl::unary::abs<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
-    return unaryExpression(hppl::unary::sign<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
-    return unaryExpression(hppl::unary::pow_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
-    return unaryExpression(hppl::unary::min<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
-    return unaryExpression(hppl::unary::max<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_eq<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ne<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_le<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_lt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ge<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_gt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
-      T p) const {
-    return unaryExpression(hppl::unary::and_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
-      T p) const {
-    return unaryExpression(hppl::unary::or_op<T>(p));
-  }
-
-  /**
-   * Element wise binary expression.
-   */
-  template <typename BinaryOp, typename ExpressionType>
-  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
-  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
-    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
-        op, derived(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator==(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator!=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_le<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::and_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator&&(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::and_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::or_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator||(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::or_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::add<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator+(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::add<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::sub<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator-(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::sub<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::mul<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator*(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::mul<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::div<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator/(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::div<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::min<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  min(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::min<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::max<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  max(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::max<T>(), expr);
-  }
-
-  /**
-   * Element wise ternary expression.
-   *
-   * ternary conditional operator(?: operator).
-   * The conditional expression returns one of two values depending on
-   * the result of derived expression.
-   * If derived expression evaluates to true, then expression1 is evaluated.
-   * If derived expression evaluates to false, then expression2 is evaluated.
-   */
-  template <typename ExprType1, typename ExprType2>
-  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
-  condition(const ExprType1& expr1, const ExprType2& expr2) const {
-    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
-        derived(), expr1, expr2);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const ExprType,
-      T>
-  condition(T p, const ExprType& expr) const {
-    return condition(constant(p), expr);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const ExprType,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(const ExprType& expr, T p) const {
-    return condition(expr, constant(p));
-  }
-
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(T p1, T p2) const {
-    return condition(constant(p1), constant(p2));
-  }
-
-  /**
-   * return a TensorConstant. A TensorConstant object hold a constant value.
-   */
-  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
-      T p) const {
-    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
-        hppl::unary::constant<T>(p), derived());
-  }
-
-  /**
-   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
-   * TensorAssignOp objects.
-   */
-  template <typename ExpressionType>
-  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
-      const ExpressionType& expr) const {
-    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
-  }
-
- protected:
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
-};
-
-/**
- * \brief Unary Operator Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp
-    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
- public:
-  explicit TensorUnaryOp(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief Binary Operator Expression
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp
-    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
- public:
-  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
-      : op_(op), lhs_(lhs), rhs_(rhs) {}
-
-  const OP op_;
-  const LhsType lhs_;
-  const RhsType rhs_;
-};
-
-/**
- * \brief Ternary Operator Expression
- */
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp : public TensorExpression<
-                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
-                            T> {
- public:
-  explicit TensorTernaryOp(const ExprType1& expr1,
-                           const ExprType2& expr2,
-                           const ExprType3& expr3)
-      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
-
-  const ExprType1 expr1_;
-  const ExprType2 expr2_;
-  const ExprType3 expr3_;
-};
-
-/**
- * \brief Constant Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorConstant
-    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
- public:
-  explicit TensorConstant(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief operator+ overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr + p;
-}
-
-/**
- * \brief operator* overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr * p;
-}
-
-}  // namespace paddle
-
-#include "TensorApply.h"
-#include "TensorEvaluate.h"
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.cu b/paddle/legacy/math/TrainingAlgorithmOp.cu
deleted file mode 100644
index 9e1eaa0f4..000000000
--- a/paddle/legacy/math/TrainingAlgorithmOp.cu
+++ /dev/null
@@ -1,356 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseMatrix.h"
-#include "TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Logging.h"
-
-#if __cplusplus > 199711L
-
-#include "TensorAssign.h"
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
-  auto expr2 =
-      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
-  auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
-                                ((real)1 / beta) * momV);
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
-  auto expr2 =
-      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
-  auto expr3 = accum_update.lazyAssign(rou * accum_update +
-                                       ((real)1 - rou) * (grad * lr).square());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  auto expr1 = accum.lazyAssign(accum + grad.square());
-  auto expr2 =
-      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4);
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
-  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  } else {
-    auto expr1 =
-        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  }
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  } else {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
-                                  ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  }
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
-  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 =
-      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
-  auto expr3 = value.lazyAssign(
-      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-}  // namespace paddle
-
-#else
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  /**
-   * \alpha_t = \alpha_{t-1} / k
-   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
-   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-   */
-  momU -= (alpha * gamma * learningRate) * grad;
-  momV += (tau * alpha * gamma * learningRate) * grad;
-  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  accum = rou * accum + ((real)1 - rou) * grad.square();
-
-  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
-  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  accum += grad.square();
-  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    g = accumulatedRou * g + grad.square();
-  } else {
-    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
-  }
-
-  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
-  f = accumulatedRou * f + ((real)1 - rou) * grad;
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  lr = (g - f.square() + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    accum = accumulatedRou * accum + grad.square();
-  } else {
-    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
-  }
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  lr = (accum + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  v = beta2 * v + ((real)1 - beta2) * grad.square();
-
-  value -= (mom * alpha) / (v.sqrt() + epsilon);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
-}
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.h b/paddle/legacy/math/TrainingAlgorithmOp.h
deleted file mode 100644
index 921c2742c..000000000
--- a/paddle/legacy/math/TrainingAlgorithmOp.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BaseMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Sparse Momentum optimizer.
- */
-extern void sparseMomentumApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& momU,
-                                BaseMatrix& momV,
-                                real alpha,
-                                real beta,
-                                real gamma,
-                                real tau,
-                                real learningRate);
-
-/**
- * \brief AdaDelta optimizer.
- */
-extern void adadeltaApply(BaseMatrix& value,
-                          BaseMatrix& grad,
-                          BaseMatrix& sum,
-                          BaseMatrix& sum1,
-                          BaseMatrix& mom,
-                          BaseMatrix& lr,
-                          real rou,
-                          real epsilon,
-                          real learningRate,
-                          real momentum,
-                          real decayRate);
-
-/**
- * \brief AdaGrad optimizer.
- */
-extern void adagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& sum,
-                         BaseMatrix& sum1,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate);
-
-/**
- * \brief RMSProp optimizer.
- */
-extern void rmspropApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& g,
-                         BaseMatrix& f,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime);
-
-/**
- * \brief Decayed AdaGrad optimizer.
- */
-extern void decayedAdagradApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& mom,
-                                BaseMatrix& accum,
-                                BaseMatrix& lr,
-                                real accumulatedRou,
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate,
-                                bool firstTime);
-
-/**
- * \brief Adam optimizer.
- */
-extern void adamApply(BaseMatrix& value,
-                      BaseMatrix& grad,
-                      BaseMatrix& mom,
-                      BaseMatrix& v,
-                      real beta1,
-                      real beta2,
-                      real beta1_power,
-                      real beta2_power,
-                      real epsilon,
-                      real learningRate);
-
-/**
- * \brief AdaMax optimizer.
- */
-extern void adamaxApply(BaseMatrix& value,
-                        BaseMatrix& grad,
-                        BaseMatrix& mom,  // firse moment
-                        BaseMatrix& u,    // weighted infinity norm
-                        real beta1,
-                        real beta2,
-                        int64_t step,
-                        real alpha);
-}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.cpp b/paddle/legacy/math/Vector.cpp
deleted file mode 100644
index 87f48bb16..000000000
--- a/paddle/legacy/math/Vector.cpp
+++ /dev/null
@@ -1,1091 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Vector.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <memory>
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "hl_matrix.h"
-#include "hl_table_apply.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-namespace paddle {
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::createParallelVector(
-    size_t size, bool useGpu, SyncThreadPool* pool) {
-  if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector &&
-      size >= (size_t)FLAGS_enable_parallel_vector) {
-    return std::make_shared<ParallelCpuVectorT<T>>(
-        size, pool ? pool : getGlobalSyncThreadPool());
-  } else {
-    return create(size, useGpu);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(T* data,
-                                               size_t size,
-                                               bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size, data);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size, data);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size,
-                                               MemoryHandlePtr memoryHandle,
-                                               size_t offset) {
-  if (auto cpuMemHandle =
-          std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<CpuVectorT<T>>(size, cpuMemHandle, offset);
-  } else if (auto gpuMemHandle =
-                 std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<GpuVectorT<T>>(size, gpuMemHandle, offset);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return NULL;
-  }
-}
-
-template <>
-MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  LOG(FATAL) << "Wrong for real vector";
-  return nullptr;
-}
-
-template <>
-MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  size_t height = getSize();
-  size_t width = idRange;
-  MatrixPtr mat = Matrix::createSparseMatrix(
-      height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);
-
-  CpuIVector cpuIds(height);
-  cpuIds.copyFrom(*this);
-  int* idData = cpuIds.getData();
-
-  for (decltype(height) i = 0; i < height; i++) {
-    const unsigned int id = idData[i];
-    CHECK_LT(id, width);
-    mat->setRow(i, 1, &id, nullptr);
-  }
-  return mat;
-}
-
-template <>
-std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
-  std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
-  if (useGpu_) {
-    hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
-  } else {
-    for (size_t i = 0; i < getSize(); ++i) {
-      ret->getData()[i] = int(this->getData()[i]);
-    }
-  }
-  return ret;
-}
-
-template <class T>
-GpuVectorT<T>::GpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<GpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 true /* useGpu = true */) {}
-
-template <class T>
-T GpuVectorT<T>::getElement(size_t i) const {
-  T elem = 0;
-  hl_memcpy_device2host(&elem, const_cast<T*>(&this->getData()[i]), sizeof(T));
-  return elem;
-}
-template <class T>
-void GpuVectorT<T>::setElement(size_t i, const T& value) {
-  hl_memcpy_host2device(&this->getData()[i], const_cast<T*>(&value), sizeof(T));
-}
-
-template <class T>
-T* GpuVectorT<T>::getPoint(const uint64_t beginPos) {
-  LOG(FATAL) << "Not implemented" << beginPos;
-  return NULL;
-}
-
-template <>
-int GpuVectorT<int>::getAbsSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-int GpuVectorT<int>::getSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getAbsSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_abs_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-real GpuVectorT<real>::getSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-int GpuVectorT<int>::getMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getMax();
-}
-
-template <>
-int GpuVectorT<int>::getAbsMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getAbsMax();
-}
-
-template <class T>
-void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  BaseMatrixT<T>::isEqualTo((BaseMatrixT<T>&)b, value);
-}
-
-template <class T>
-void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifdef PADDLE_WITH_CUDA
-  hl_vector_select_from<T>(this->getData(),
-                           this->getSize(),
-                           src.getData(),
-                           src.getSize(),
-                           ids.getData(),
-                           ids.getSize());
-#endif
-}
-
-template <class Func>
-real gpuRowFunc(Func f, GpuVector& v) {
-  static ThreadLocal<std::unique_ptr<CpuVectorT<real>>> local;
-  if (!*local) {
-    (*local).reset(new CpuVector(1));
-  }
-  real* A = v.getData();
-  f(A, (*local)->getData(), 1, v.getSize());
-  return (*local)->getData()[0];
-}
-
-template <>
-real GpuVectorT<real>::getMax() {
-  return gpuRowFunc(hl_matrix_row_max, *this);
-}
-
-template <>
-real GpuVectorT<real>::getAbsMax() {
-  return std::max(gpuRowFunc(hl_matrix_row_max, *this),
-                  -gpuRowFunc(hl_matrix_row_min, *this));
-}
-
-template <>
-int GpuVectorT<int>::getMin() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getMin() {
-  return gpuRowFunc(hl_matrix_row_min, *this);
-}
-
-template <class T>
-T GpuVectorT<T>::get(size_t pos) {
-  T val = (T)0;
-  hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T));
-  return val;
-}
-
-template <class T>
-void GpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::zeroMem() {
-  BaseMatrixT<T>::zero();
-}
-
-template <class T>
-void GpuVectorT<T>::reset(const T& value) {
-  BaseMatrixT<T>::assign(value);
-}
-
-template <class T>
-void GpuVectorT<T>::fillSequence() {
-  LOG(FATAL) << "not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  CHECK_EQ(src.getSize(), this->getSize());
-  hl_memcpy_async((void*)this->getData(),
-                  (void*)src.getData(),
-                  sizeof(T) * this->getSize(),
-                  stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy_async(
-      (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2device((void*)dest->getData(),
-                          (void*)this->getData(),
-                          sizeof(T) * this->getSize());
-}
-
-template <>
-void GpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  IVectorPtr dest = IVector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  VectorPtr dest = Vector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-template <>
-void GpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::rand() {
-  VectorPtr cPtr = Vector::create(this->size_, false);
-  cPtr->rand();
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real));
-}
-
-template <>
-void GpuVectorT<int>::rand(size_t classNum) {
-  IVectorPtr cPtr = IVector::create(this->size_, false);
-  cPtr->rand(classNum);
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int));
-}
-
-template <>
-void CpuVectorT<int>::rand(size_t classNum) {
-  size_t size = this->getSize();
-  int* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] =
-        std::min(classNum - 1,
-                 size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum));
-  }
-}
-
-template <>
-void CpuVectorT<real>::rand() {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] = ::rand() * (1. / (double)RAND_MAX);
-    // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) *
-    // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 );
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void CpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::randnorm(real mean, real std) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size - 1; i += 2) {
-    real r1 = rand1();
-    r1 = std::sqrt(-2 * std::log(r1));
-    real r2 = rand1();
-    data[i] = mean + std * r1 * cos(2 * M_PI * r2);
-    data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2);
-  }
-  real r1 = rand1();
-  r1 = std::sqrt(-2 * std::log(r1));
-  real r2 = rand1();
-  data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2);
-}
-
-template <>
-void CpuVectorT<real>::uniform(real left, real right) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  real range = right - left;
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size; ++i) {
-    data[i] = rand1() * range + left;
-  }
-}
-
-template <>
-void GpuVectorT<real>::randnorm(real mean, real std) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.randnorm(mean, std);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <>
-void GpuVectorT<real>::uniform(real left, real right) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.uniform(left, right);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<CpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(const VectorT<T>& src)
-    : VectorT<T>(src.getSize(),
-                 src.getMemoryHandle(),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {
-  if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) {
-    this->memoryHandle_ =
-        std::make_shared<CpuMemoryHandle>(sizeof(T) * this->getSize());
-    this->data_ = reinterpret_cast<T*>(this->memoryHandle_->getBuf());
-  }
-  src.copyTo(this);
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-// cannot use above version, due to precision issue of float
-template <>
-real CpuVectorT<real>::getAbsSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::getSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <>
-real CpuVectorT<real>::getSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::get(size_t pos) {
-  return this->getData()[pos];
-}
-
-template <class T>
-T CpuVectorT<T>::getMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res < A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = std::abs(A[0]);
-  for (size_t i = 1; i < size; i++) {
-    if (res < std::abs(A[i])) res = std::abs(A[i]);
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getMin() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res > A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-void CpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  size_t size = this->getSize();
-  CHECK_EQ(b.getSize(), size);
-
-  const T* B = b.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = (B[i] == value);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-  size_t size = this->getSize();
-  CHECK_EQ(ids.getSize(), size);
-
-  const int* indices = ids.getData();
-  const T* B = src.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    int index = indices[i];
-    CHECK_LT(index, (int)src.getSize());
-    A[i] = B[index];
-  }
-}
-
-static int getSignAndExponentOfFloat(float a) {
-  uint32_t* pa = reinterpret_cast<uint32_t*>(&a);
-  return *pa >> 23;
-}
-
-template <class T>
-void CpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::histogram(std::ostream& os, int type) {
-  int counters[512];
-  memset(counters, 0, sizeof(counters));
-  int counterZero = 0;
-
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (A[i] == 0.0f) {
-      ++counterZero;
-    } else {
-      ++counters[getSignAndExponentOfFloat(A[i])];
-    }
-  }
-
-  int64_t sum = 0;
-  float sizeNonZero = size - counterZero;
-  os << "zero:" << counterZero;
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i];
-    if (counter) {
-      os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i + 256];
-    if (counter) {
-      os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  os << ", nonzero_exponent_avg=" << sum / sizeNonZero;
-}
-
-template <class T>
-void CpuVectorT<T>::zeroMem() {
-  memset(this->getData(), 0, sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::reset(const T& value) {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = value;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::fillSequence() {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = i;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  if (typeid(src) == typeid(GpuVectorT<T>)) {
-    hl_memcpy_async((void*)this->getData(),
-                    (void*)src.getData(),
-                    sizeof(T) * this->getSize(),
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else {
-    src.copyTo(this);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc, size_t size) {
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc,
-                             size_t size,
-                             hl_stream_t stream) {
-  (void)stream;
-
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  hl_memcpy_host2device((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <>
-void CpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << (int)data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << data_[idx] << ";";
-}
-
-template <>
-void CpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << (int)data_[idx] << ";";
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::parallelExec(ExecFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::parallelExec(ExecFunc func) {
-  pool_->exec([this, func](int tid, size_t numThreads) {
-    auto interval = calcSplitArrayInterval(
-        this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    // setup sub bufs
-    CpuVector subVec(0, nullptr);
-    subVec.subVecFrom(*this, interval);
-    func(subVec);
-  });
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::exec(SyncThreadPool::JobFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
-  pool_->exec(func);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src)
-    : sync_(nullptr) {
-  bool useGpu = src->useGpu();
-  if (useGpu) {
-    gpuVectorT_ = src;
-  } else {
-    cpuVectorT_ = src;
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, T* data, bool useGpu)
-    : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size, data);
-    setSync(DATA_AT_CPU);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size, data);
-    setSync(DATA_AT_GPU);
-  }
-}
-
-template <class T>
-std::shared_ptr<CpuGpuVectorT<T>> CpuGpuVectorT<T>::create(size_t size,
-                                                           bool useGpu) {
-  return std::make_shared<CpuGpuVectorT<T>>(size, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resize(size_t size, bool useGpu) {
-  if (useGpu) {
-    CHECK(gpuVectorT_) << "gpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (gpuVectorT_->getMemoryHandle()) {
-      gpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(gpuVectorT_->getSize(), size);
-    }
-  } else {
-    CHECK(cpuVectorT_) << "cpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (cpuVectorT_->getMemoryHandle()) {
-      cpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(cpuVectorT_->getSize(), size);
-    }
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                                      size_t size,
-                                      bool useGpu) {
-  if (vec) {
-    vec->resize(size, useGpu);
-  } else {
-    vec = create(size, useGpu);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
-  if (useGpu && (!gpuVectorT_)) {
-    gpuVectorT_ = VectorT<T>::create(size, true);
-  } else if ((!useGpu) && (!cpuVectorT_)) {
-    cpuVectorT_ = VectorT<T>::create(size, false);
-  } else {
-    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
-    this->resize(size, useGpu);
-  }
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size)
-    : sync_(nullptr) {
-  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifdef PADDLE_WITH_CUDA
-  SyncedFlag* flag = src.getSync();
-  if (*flag == DATA_AT_CPU) {
-    src.copyToGpu();  // will set synchronous data between CPU and GPU
-  } else if (*flag == DATA_AT_GPU) {
-    src.copyToCpu();  // will set synchronous data between CPU and GPU
-  }
-#endif
-  auto cMemHandle = (src.getVector(false))->getMemoryHandle();
-  cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifdef PADDLE_WITH_CUDA
-  auto gMemHandle = (src.getVector(true))->getMemoryHandle();
-  gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
-  src.setSync(SYNCED);
-#endif
-  setSync(src.getSync());
-}
-
-template <class T>
-std::shared_ptr<const VectorT<T>> CpuGpuVectorT<T>::getVector(
-    bool useGpu) const {
-  auto* self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return std::const_pointer_cast<const VectorT<T>>(gpuVectorT_);
-  } else {
-    self->copyToCpu();
-    return std::const_pointer_cast<const VectorT<T>>(cpuVectorT_);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>>& CpuGpuVectorT<T>::getMutableVector(bool useGpu) {
-  setSync(useGpu);
-  if (useGpu) {
-    copyToGpu();
-    return gpuVectorT_;
-  } else {
-    copyToCpu();
-    return cpuVectorT_;
-  }
-}
-
-template <class T>
-const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
-  auto self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return gpuVectorT_->getData();
-  } else {
-    self->copyToCpu();
-    return cpuVectorT_->getData();
-  }
-}
-
-// Operation will change data and need to reset sync_ & syncFlag_.
-#define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
-  do {                                         \
-    if (useGpu) {                              \
-      copyToGpu();                             \
-      setSync(useGpu);                         \
-      return gpuVectorT_->OP(args);            \
-    } else {                                   \
-      copyToCpu();                             \
-      setSync(useGpu);                         \
-      return cpuVectorT_->OP(args);            \
-    }                                          \
-  } while (0)
-
-template <class T>
-T* CpuGpuVectorT<T>::getMutableData(bool useGpu) {
-  MUTABLE_VECTOR_OP(getData, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::zeroMem(bool useGpu) {
-  MUTABLE_VECTOR_OP(zeroMem, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::fillSequence(bool useGpu) {
-  MUTABLE_VECTOR_OP(fillSequence, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::setElement(size_t i, const T& value, bool useGpu) {
-  MUTABLE_VECTOR_OP(setElement, useGpu, i, value);
-}
-
-template <class T>
-T CpuGpuVectorT<T>::getElement(size_t i) const {
-  switch (*this->getSync()) {
-    case SYNCED:
-    case DATA_AT_CPU:
-      return cpuVectorT_->getElement(i);
-      break;
-    case DATA_AT_GPU:
-      return gpuVectorT_->getElement(i);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  auto cVec = dynamic_cast<const CpuVectorT<T>*>(&src);
-  auto gVec = dynamic_cast<const GpuVectorT<T>*>(&src);
-  if (cVec) {
-    copyToCpu(cVec->getData(), cVec->getSize(), stream);
-  } else if (gVec) {
-    copyToGpu(gVec->getData(), gVec->getSize(), stream);
-  } else {
-    LOG(FATAL) << "Invalid type of src";
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data, size_t size, bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size);
-  } else {
-    copyToCpu(data, size);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data,
-                                size_t size,
-                                hl_stream_t stream,
-                                bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size, stream);
-  } else {
-    copyToCpu(data, size, stream);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size,
-                                bool useGpu,
-                                hl_stream_t stream) {
-  if (useGpu) {
-    VectorT<T>::resizeOrCreate(gpuVectorT_, size, true);
-    gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream);
-  } else {
-    VectorT<T>::resizeOrCreate(cpuVectorT_, size, false);
-    cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream) {
-  switch (*src.getSync()) {
-    case DATA_AT_CPU:
-      copyFrom(*(src.getVector(false)), stream);
-      break;
-    case DATA_AT_GPU:
-      copyFrom(*(src.getVector(true)), stream);
-      break;
-    case SYNCED:
-      copyFrom(*(src.getVector(false)), stream);
-      copyFrom(*(src.getVector(true)), stream);
-      setSync(SYNCED);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToCpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_GPU:
-      CHECK(gpuVectorT_);
-      this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_CPU:
-    case SYNCED:
-      CHECK(cpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToGpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_CPU:
-      CHECK(cpuVectorT_);
-      this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_GPU:
-    case SYNCED:
-      CHECK(gpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template class VectorT<real>;
-template class VectorT<int>;
-template class CpuVectorT<real>;
-template class CpuVectorT<int>;
-template class GpuVectorT<real>;
-template class GpuVectorT<int>;
-template class CpuGpuVectorT<real>;
-template class CpuGpuVectorT<int>;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.h b/paddle/legacy/math/Vector.h
deleted file mode 100644
index 63cb4651c..000000000
--- a/paddle/legacy/math/Vector.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <memory>
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Thread.h"
-
-namespace paddle {
-
-template <class T>
-class GpuVectorT;
-template <class T>
-class CpuVectorT;
-
-template <class T>
-class BaseVector;
-
-class SyncThreadPool;
-
-class Matrix;
-
-template <class T>
-class BaseVector : public BaseMatrixT<T> {
- public:
-  BaseVector(size_t size, T* data, bool useGpu)
-      : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
-
-  ~BaseVector() {}
-
- protected:
-  size_t& size_;
-};
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-template <class T>
-class VectorT : public BaseVector<T> {
- protected:
-  VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
-      : BaseVector<T>(size,
-                      reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
-                      useGpu) {
-    memoryHandle_ = memoryHandle;
-  }
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  VectorT(size_t size, T* data, bool useGpu)
-      : BaseVector<T>(size, data, useGpu) {}
-
- public:
-  virtual ~VectorT() {}
-
-  static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(T* data, size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(size_t size,
-                                            MemoryHandlePtr memoryHandle,
-                                            size_t offset = 0);
-
-  // owner can set SyncThreadPool,
-  // if not set, will use globalSyncThreadPool,
-  // which can be used in main thread only.
-  static std::shared_ptr<VectorT<T>> createParallelVector(
-      size_t size, bool useGpu, SyncThreadPool* pool = nullptr);
-
-  size_t getSize() const { return this->size_; }
-  const T* getData() const { return this->data_; }
-  T* getData() { return this->data_; }
-
-  virtual void zeroMem() = 0;
-  // set all elements to value
-  virtual void reset(const T& value) = 0;
-  // fill data by 0, 1, 2, ...
-  virtual void fillSequence() = 0;
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  /**
-   * resizing to a big vector will not preserve old values.
-   */
-  void resize(size_t newSize) {
-    if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) {
-      memoryHandle_ = newMemory(newSize * sizeof(T));
-      this->data_ = reinterpret_cast<T*>(memoryHandle_->getBuf());
-    }
-    this->size_ = newSize;
-  }
-
-  static void resizeOrCreate(std::shared_ptr<VectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu) {
-    if (vec) {
-      vec->resize(size);
-    } else {
-      vec = create(size, useGpu);
-    }
-  }
-
-  virtual MemoryHandlePtr newMemory(size_t size) = 0;
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const VectorT<T>& src, size_t start, size_t size) {
-    CHECK_EQ(BaseVector<T>::useGpu_, src.useGpu_);
-    CHECK_LT(start, src.size_);
-    CHECK_LE(start + size, src.size_);
-
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src.data_) + start;
-  }
-
-  std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size) {
-    CHECK_LE(start + size, static_cast<size_t>(getSize()));
-    return VectorT<T>::create(getData() + start, size, BaseVector<T>::useGpu_);
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const T* src, size_t start, size_t size) {
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src) + start;
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   * in *interval* [interval.first, interval.second)
-   */
-  void subVecFrom(const VectorT<T>& src, std::pair<size_t, size_t> interval) {
-    subVecFrom(src, interval.first, interval.second - interval.first);
-  }
-
-  /**
-   * convert the vector to a sparse one_hot matrix of width idRange
-   * only applies to IVector
-   */
-  std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
-
-  /**
-   * @brief cast vector of "real" elements to "int" elements.
-   *
-   * @note: float -> int must be casted, or you'll get wrong data.
-   */
-  std::shared_ptr<VectorT<int>> castToInt();
-
-  /**
-   * This function will crash if the size of src and dest is different.
-   */
-  virtual void copyFrom(const VectorT<T>& src) = 0;
-
-  /**
-   * If GpuVector, this function is an asynchronous interface,
-   * will push the copy-task to the specifed-stream and return immediately.
-   *
-   * If CpuVector, this function is an synchronous interface,
-   * same as the copyFrom(const VectorT<T>& src).
-   */
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory
-   */
-  virtual void copyFrom(const T* src, size_t size) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory,
-   */
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0;
-
-  /**
-   * exec a func in single/multi thread
-   */
-  virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); }
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) = 0;
-
-  /// Get the value for the i'th element
-  virtual T getElement(size_t i) const = 0;
-  virtual void setElement(size_t i, const T& value) = 0;
-
-  //----------  math operations ----------------
-
-  // sum of the absolute value of each elements
-  virtual T getAbsSum() = 0;
-
-  virtual T getSum() = 0;
-  virtual T getMax() = 0;
-  virtual T getAbsMax() = 0;
-  virtual T getMin() = 0;
-
-  /// element-wise calc:  this = (b == value)
-  virtual void isEqualTo(const VectorT<T>& b, const T& value) = 0;
-
-  /// select elements indexed by *ids* from vector *src*
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) = 0;
-
-  enum HistogramType {
-    HISTOGRAM_EXPONENT = 0,
-  };
-
-  /**
-   * @brief  print histogram of vector values
-   *
-   * @note   only exponent histogram supported currently
-   */
-  virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0;
-
-  /// generate uniform random value for each element
-  virtual void rand() = 0;
-  /**
-   * generate uniform random value for each element,
-   * data range is from 0 to (classes - 1).
-   */
-  virtual void rand(size_t classes) = 0;
-
-  /**
-   * Debug use only. Very inefficient for GPU vector.
-   * get the value at pos.
-   */
-  virtual T get(size_t pos) = 0;
-
-  /**
-   * generate univariate Gaussian distributed random numbers
-   * with given mean and standardDeviation.
-   */
-  virtual void randnorm(real mean, real standardDeviation) = 0;
-
-  /**
-   * generate uniform distributed random numbers
-   * with given range.
-   */
-  virtual void uniform(real left, real right) = 0;
-
-  /// print the first "num" elements of the Vector
-  virtual void print(std::ostream& os, size_t num) const = 0;
-
-  /// print the "idx" element of the Vector
-  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (BaseVector<T>::useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
- protected:
-  friend class GpuVectorT<T>;
-  friend class CpuVectorT<T>;
-  virtual void copyTo(CpuVectorT<T>* dest) const = 0;
-  virtual void copyTo(GpuVectorT<T>* dest) const = 0;
-  MemoryHandlePtr memoryHandle_;
-};
-
-template <class T>
-std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
-  vec.print(os, vec.getSize());
-  return os;
-}
-
-template <class T>
-class GpuVectorT : public VectorT<T> {
- public:
-  explicit GpuVectorT(size_t size);
-  GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
-      : VectorT<T>(size, memHandle, offset, true) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  GpuVectorT(size_t size, T* data) : VectorT<T>(size, data, true) {}
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<GpuMemoryHandle>(size);
-  }
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual T getElement(size_t i) const;
-  virtual void setElement(size_t i, const T& value);
-  virtual T* getPoint(const uint64_t beginPos);
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<T>(*this, expr);
-  }
-
- protected:
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-};
-
-template <class T>
-class CpuVectorT : public VectorT<T> {
- public:
-  explicit CpuVectorT(size_t size);
-  CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
-      : VectorT<T>(size, memoryHandle, offset, false) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  CpuVectorT(size_t size, T* data) : VectorT<T>(size, data, false) {}
-
-  /**
-   * If src is a CpuVector, the new CpuVector will share the data with src
-   *
-   * If src is a GpuVector, the new CpuVector will copy data from src
-   */
-  explicit CpuVectorT(const VectorT<T>& src);
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<CpuMemoryHandle>(size);
-  }
-
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) {
-    return this->getData() + beginPos;
-  }
-
-  virtual T getElement(size_t i) const { return this->getData()[i]; }
-  virtual void setElement(size_t i, const T& value) {
-    this->getData()[i] = value;
-  }
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<T>(*this, expr);
-  }
-};
-
-template <class T>
-class ParallelCpuVectorT : public CpuVectorT<T> {
- public:
-  ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
-      : CpuVectorT<T>(size), pool_(pool) {}
-
-  virtual void zeroMem() {
-    parallelExec([](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::zeroMem(); });
-  }
-  virtual void randnorm(real mean, real standardDeviation) {
-    parallelExec([=](CpuVectorT<T>& vec) {
-      vec.CpuVectorT<T>::randnorm(mean, standardDeviation);
-    });
-  }
-  virtual void uniform(real left, real right) {
-    parallelExec(
-        [=](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::uniform(left, right); });
-  }
-
-  virtual void exec(SyncThreadPool::JobFunc jobFunc);
-
- private:
-  typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
-  void parallelExec(ExecFunc func);
-  SyncThreadPool* pool_;
-};
-
-/**
- * A class to do conversion between CpuVector and GpuVector automatically.
- */
-template <class T>
-class CpuGpuVectorT {
- public:
-  /**
-   * @brief An enum type of SyncedFlag using to
-   *        mark data memory is in CPU or GPU.
-   *
-   * DATA_AT_CPU: data is located in CPU.
-   *
-   * DATA_AT_GPU: data is located in GPU.
-   *
-   * SYNCED: data is located in CPU and GPU simultaneously.
-   */
-  enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 };
-
-  /**
-   * @brief A constructor, create cpuVectorT_ or gpuVectorT_.
-   *
-   * @param[in] size    data size.
-   * @param[in] useGpu  use gpu or not.
-   */
-  explicit CpuGpuVectorT(size_t size, bool useGpu);
-
-  /**
-   * @brief A constructor, create CpuGpuVectorT by VectorT.
-   *
-   * If src is CpuVector, cpuVectorT_ is shared data with src.
-   *
-   * If src is GpuVector, gpuVectorT_ is shared data with src.
-   */
-  explicit CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src);
-
-  /**
-   * @brief A constructor.
-   *
-   * If useGpu is true, data should be located in device and
-   * create gpuVectorT_ with data.
-   *
-   * If useGpu is false, data should be located in host and
-   * create cpuVectorT_ with data.
-   *
-   * @note Data is owned by the caller and should be valid during
-   *       the life of this vector.
-   *       Caller is responsible for release the memory.
-   */
-  CpuGpuVectorT(size_t size, T* data, bool useGpu);
-
-  CpuGpuVectorT(CpuGpuVectorT<T>& src, size_t offset, size_t size);
-
-  virtual ~CpuGpuVectorT() {}
-
-  static std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu);
-
-  /**
-   * @brief resize vector.
-   *
-   * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,
-   *
-   * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.
-   */
-  void resize(size_t size, bool useGpu);
-
-  /**
-   * @brief resize or create CpuGpuVectorT.
-   */
-  static void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu);
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * If useGpu is true, return gpuVectorT_.
-   *
-   * If useGpu is false, return cpuVectorT_.
-   *
-   * @note Caller should not change the data.
-   *       If caller changes const attribute,
-   *       should set syncFlag_.
-   */
-  std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const;
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * @note: This interface will change syncFlag_, so if you will
-   *        not change the data, you should call getVector.
-   */
-  std::shared_ptr<VectorT<T>>& getMutableVector(bool useGpu);
-
-  /**
-   * @brief return const T* data.
-   *
-   * If useGpu is true, return device data.
-   *
-   * If useGpu is false, return host data.
-   */
-  const T* getData(bool useGpu) const;
-
-  // TODO(yuyang18): Make getData more c++ style.
-  //  inline T* getData(bool useGpu) {
-  //    return getMutableData(useGpu);
-  //  }
-
-  T* getMutableData(bool useGpu);
-
-  /**
-   * If useGpu is true, gpuVectorT_->Op().
-   *
-   * If useGpu is false, cpuVectorT_->Op().
-   *
-   * Op is zeroMem, fillSequence, ...
-   */
-  void zeroMem(bool useGpu);
-  void fillSequence(bool useGpu);
-  void setElement(size_t i, const T& value, bool useGpu);
-
-  /**
-   * @brief return i-th element.
-   */
-  T getElement(size_t i) const;
-
-  /**
-   * @brief return vector size.
-   */
-  size_t getSize() const {
-    size_t size = 0;
-    switch (*sync_) {
-      case SYNCED:
-      case DATA_AT_CPU:
-        size = cpuVectorT_->getSize();
-        break;
-      case DATA_AT_GPU:
-        size = gpuVectorT_->getSize();
-        break;
-      default:
-        LOG(FATAL) << "Not support";
-        break;
-    }
-    return size;
-  }
-
-  /// copy data to cpuVectorT_.
-  inline void copyToCpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_CPU);
-  }
-  /// copy data to cpuVectorT_ using specifed-stream.
-  inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_CPU);
-  }
-
-  /// copy data to gpuVectorT_.
-  inline void copyToGpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_GPU);
-  }
-  /// copy data to gpuVectorT_ using specifed-stream.
-  inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_GPU);
-  }
-
-  /**
-   * @brief copy from src using specifed-stream.
-   *
-   * If src is CpuVectorT, copy to cpuVectorT_.
-   *
-   * If src is GpuVectorT, copy to gpuVectorT_.
-   */
-  void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief copy data.
-   *
-   * If useGpu is false, copy host data to cpuVectorT_.
-   *
-   * If useGpu is true, copy device data to gpuVectorT_.
-   *
-   * @note  data address should consistent with useGpu.
-   */
-  void copyFrom(const T* data, size_t size, bool useGpu);
-  void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu);
-
-  /**
-   * @brief copy from (src + offset) using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src,
-                size_t offset,
-                size_t size,
-                bool useGpu,
-                hl_stream_t stream);
-
-  /**
-   * @brief copy from src using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief return sync_.
-   */
-  inline SyncedFlag* getSync() const { return sync_; }
-
-  /**
-   * @brief set sync_.
-   */
-  inline void setSync(SyncedFlag* sync) { sync_ = sync; }
-
-  inline void setSync(SyncedFlag syncFlag) {
-    if (sync_) {
-      *sync_ = syncFlag;
-    } else {
-      syncFlag_ = syncFlag;
-      sync_ = &syncFlag_;
-    }
-  }
-
-  inline void setSync(bool useGpu) {
-    SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;
-    setSync(flag);
-  }
-
- protected:
-  void resizeOrCreate(size_t size, bool useGpu);
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToCpu();
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToGpu();
-
-  /// host pointer.
-  std::shared_ptr<VectorT<T>> cpuVectorT_;
-  /// device pointer.
-  std::shared_ptr<VectorT<T>> gpuVectorT_;
-  /// specify current data address.
-  SyncedFlag syncFlag_;
-  SyncedFlag* sync_;
-};
-
-typedef VectorT<real> Vector;
-typedef CpuVectorT<real> CpuVector;
-typedef GpuVectorT<real> GpuVector;
-
-typedef VectorT<int> IVector;
-typedef CpuVectorT<int> CpuIVector;
-typedef GpuVectorT<int> GpuIVector;
-
-typedef std::shared_ptr<Vector> VectorPtr;
-typedef std::shared_ptr<CpuVector> CpuVectorPtr;
-typedef std::shared_ptr<GpuVector> GpuVectorPtr;
-
-typedef std::shared_ptr<IVector> IVectorPtr;
-typedef std::shared_ptr<CpuIVector> CpuIVectorPtr;
-typedef std::shared_ptr<GpuIVector> GpuIVectorPtr;
-
-typedef CpuGpuVectorT<real> CpuGpuVector;
-typedef CpuGpuVectorT<int> ICpuGpuVector;
-typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr;
-typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/math/tests/CMakeLists.txt b/paddle/legacy/math/tests/CMakeLists.txt
deleted file mode 100644
index d8b7f9e3f..000000000
--- a/paddle/legacy/math/tests/CMakeLists.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-# unittest for common package
-
-add_simple_unittest(test_ExecViaCpu)
-add_simple_unittest(test_SIMDFunctions)
-add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_RowBuffer)
-if(NOT MOBILE_INFERENCE)
-    add_simple_unittest(test_SparseMatrix)
-endif()
-
-# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
-add_unittest(test_matrixCompare
-    test_matrixCompare.cpp)
-
-add_simple_unittest(test_sparseMatrixCompare)
-add_simple_unittest(test_perturbation)
-add_simple_unittest(test_CpuGpuVector)
-add_simple_unittest(test_Allocator)
-
-if(WITH_GPU)
-    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
-    link_paddle_test(test_Tensor)
-    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-    link_paddle_test(test_lazyAssign)
-else()
-    compile_cu_as_cpp(test_Tensor.cu)
-    add_unittest(test_Tensor test_Tensor.cu)
-    compile_cu_as_cpp(test_lazyAssign.cu)
-    add_unittest(test_lazyAssign test_lazyAssign.cu)
-endif(WITH_GPU)
-
-add_simple_unittest(test_FPException)
-add_simple_unittest(test_GpuProfiler)
-add_simple_unittest(test_BaseMatrix)
-add_simple_unittest(test_Matrix)
diff --git a/paddle/legacy/math/tests/OriginalOptimizerApi.h b/paddle/legacy/math/tests/OriginalOptimizerApi.h
deleted file mode 100644
index f386e1995..000000000
--- a/paddle/legacy/math/tests/OriginalOptimizerApi.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-
-using namespace paddle;  // NOLINT
-
-void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
-                                      real alpha,
-                                      real beta,
-                                      real gamma,
-                                      real tau,
-                                      real learningRate) {
-  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                   -alpha * gamma * learningRate);
-  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                   tau * alpha * gamma * learningRate);
-  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                             tau / beta + 1.0 / alpha,
-                             *vecs[PARAMETER_MOMENTUM_VT],
-                             1.0 / beta);
-}
-
-void AdagradParameterOptimizer(const VectorPtr vecs[],
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate) {
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
-                                                1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
-                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
-
-  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
-  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
-                                        epsilon,
-                                        epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
-      *vecs[PARAMETER_GRADIENT],
-      *vecs[PARAMETER_LEARNING_RATE],
-      rou,
-      1.0f - rou);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void RMSPropParameterOptimizer(const VectorPtr vecs[],
-                               real accumulatedRou,
-                               real rou,
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate,
-                               bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                           -1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
-                                      real accumulatedRou,
-                                      real rou,
-                                      real epsilon,
-                                      real learningRate,
-                                      real momentum,
-                                      real decayRate,
-                                      bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdamParameterOptimizer(const VectorPtr vecs[],
-                            real beta1,
-                            real beta2,
-                            real beta1_power,
-                            real beta2_power,
-                            real epsilon,
-                            real learningRate) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  g->square2();
-  v->add(*g, beta2, 1 - beta2);
-
-  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
-  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
-  g->sqrt2(*v);
-  g->dotDiv(*m, *g, 0., epsilon);
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-  theta->add(*theta, 1.0, *g, -alpha);
-}
-
-void AdamaxParameterOptimizer(
-    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u->mulScalar(beta2);
-  g->abs2();
-  u->max2(*u, *g);
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  g->dotDiv(*m, *u);
-  real learningRate = alpha / (1 - std::pow(beta1, step));
-  theta->add(*theta, 1.0, *g, -learningRate);
-}
diff --git a/paddle/legacy/math/tests/PerfUtils.h b/paddle/legacy/math/tests/PerfUtils.h
deleted file mode 100644
index eaf4869e4..000000000
--- a/paddle/legacy/math/tests/PerfUtils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// Performance Check
-#ifdef PADDLE_DISABLE_TIMER
-
-#define EXPRESSION_PERFORMANCE(expression) expression;
-
-#else
-
-#include "paddle/legacy/utils/Stat.h"
-using namespace paddle;  // NOLINT
-
-#define EXPRESSION_PERFORMANCE(expression)                             \
-  do {                                                                 \
-    char expr[30];                                                     \
-    strncpy(expr, #expression, 30);                                    \
-    if (expr[29] != '\0') {                                            \
-      expr[27] = '.';                                                  \
-      expr[28] = '.';                                                  \
-      expr[29] = '\0';                                                 \
-    }                                                                  \
-    expression;                                                        \
-    for (int i = 0; i < 20; i++) {                                     \
-      REGISTER_TIMER(expr);                                            \
-      expression;                                                      \
-    }                                                                  \
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
-              << *globalStat.getStat(expr);                            \
-    globalStat.reset();                                                \
-  } while (0)
-
-#endif
diff --git a/paddle/legacy/math/tests/TensorCheck.h b/paddle/legacy/math/tests/TensorCheck.h
deleted file mode 100644
index 41c8ece28..000000000
--- a/paddle/legacy/math/tests/TensorCheck.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a TensorCheck template function, which can be used to
- * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
- */
-
-#include <cmath>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace autotest {
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::VectorT;
-using paddle::CpuVectorT;
-using paddle::GpuVectorT;
-
-class AssertEqual {
- public:
-  AssertEqual(real err = 0) : err_(err) {}
-
-  inline bool operator()(real a, real b) {
-    if (err_ == 0) {
-      if (a != b) {
-        return false;
-      }
-    } else {
-      if (std::fabs(a - b) > err_) {
-        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
-          return false;
-        }
-      }
-    }
-
-    return true;
-  }
-
- private:
-  real err_;
-};
-
-template <typename Tensor>
-class CopyToCpu;
-
-template <>
-class CopyToCpu<CpuMatrix> {
- public:
-  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
-  const CpuMatrix& copiedArg() const { return arg_; }
-
- private:
-  const CpuMatrix& arg_;
-};
-
-template <>
-class CopyToCpu<GpuMatrix> {
- public:
-  explicit CopyToCpu(const GpuMatrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
- private:
-  CpuMatrix arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
- public:
-  explicit CopyToCpu(const Matrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
- private:
-  CpuMatrix arg_;
-};
-
-template <typename T>
-class CopyToCpu<CpuVectorT<T>> {
- public:
-  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
-  const CpuVectorT<T>& copiedArg() const { return arg_; }
-
- private:
-  const CpuVectorT<T>& arg_;
-};
-
-template <typename T>
-class CopyToCpu<GpuVectorT<T>> {
- public:
-  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
- private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename T>
-class CopyToCpu<VectorT<T>> {
- public:
-  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
- private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare,
-                 const CpuMatrix& matrix1,
-                 const CpuMatrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (!compare(a, b)) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-template <typename AssertEq, class T>
-void TensorCheck(AssertEq compare,
-                 const CpuVectorT<T>& vector1,
-                 const CpuVectorT<T>& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const T* data1 = vector1.getData();
-  const T* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (!compare(a, b)) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
-}
-
-template <typename AssertEq, typename Tensor1, typename Tensor2>
-void TensorCheck(AssertEq compare,
-                 const Tensor1& tensor1,
-                 const Tensor2& tensor2) {
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, real args1, real args2) {
-  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
-                                         << ", args2 = " << args2;
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
-  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
-                          << ", args2 = " << args2;
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
-  AssertEqual compare(0);
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
-#ifndef PADDLE_TYPE_DOUBLE
-  AssertEqual compare(1e-3);
-#else
-  AssertEqual compare(1e-10);
-#endif
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-}  // namespace autotest
diff --git a/paddle/legacy/math/tests/TestUtils.h b/paddle/legacy/math/tests/TestUtils.h
deleted file mode 100644
index 60e76359d..000000000
--- a/paddle/legacy/math/tests/TestUtils.h
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a AutoCompare calss to simplify the comparison
- * of CPU and GPU member functions.
- *
- * This takes two steps
- * 1. Construct an AutoCompare object.
- *    When constructing an AutoCompare object, you can set the err argument
- * to specify the maximum error for CPU and GPU functions.
- *
- * 2. Use the template functions cmpWithArg or cmpWithoutArg.
- * A. [cmpWithArg] Requires the caller construct the cpu arguments.
- *
- *  AutoCompare test;
- *  Init Argument arg1,arg2...
- *  test.cmpWithArg(function, arg1, arg2....)
- *
- * B. [cmpWithoutArg] The caller do not need construct arguments.
- *    If matrix used in these functions arguments is the same size.
- *    Such as the element wise function and the aggregate function
- *    defined in the BaseMatrix.cpp.
- *
- *  AutoCompare test;
- *  test.cmpWithoutArg<I...>(function, height, width)
- */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace autotest {
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using paddle::CpuSparseMatrix;
-using paddle::GpuSparseMatrix;
-
-template <typename T1, typename T2>
-class ReplaceType {
- public:
-  typedef T1 type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, CpuMatrix> {
- public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, GpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, CpuMatrix> {
- public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, GpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-// construct a argument
-template <typename T>
-T construct(int height, int width);
-
-template <>
-float construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-double construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-size_t construct(int height, int width) {
-  size_t offset = std::rand() % (height < width ? height : width);
-  return offset;
-}
-
-template <>
-CpuMatrix construct(int height, int width) {
-  CpuMatrix a(height, width);
-  return a;
-}
-
-template <>
-GpuMatrix construct(int height, int width) {
-  GpuMatrix a(height, width);
-  return a;
-}
-
-// init a argument
-template <typename T>
-void init(T& v) {
-  return;
-}
-
-template <>
-void init(CpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-template <>
-void init(GpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-// init a tuple which contains a set of arguments.
-template <std::size_t I = 0, typename... Args>
-inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
-    std::tuple<Args...>& t) {}
-
-template <std::size_t I = 0, typename... Args>
-    inline typename std::enable_if <
-    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
-  init(std::get<I>(t));
-  initTuple<I + 1>(t);
-}
-
-// copy a argument, copy src to dest
-template <typename T1, typename T2>
-void copy(T1& dest, T2& src) {
-  dest = src;
-}
-
-template <>
-void copy(GpuMatrix& dest, CpuMatrix& src) {
-  dest.copyFrom(src);
-}
-
-// copy a tuple, copy src to dest
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
-    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
-
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-    inline typename std::enable_if <
-    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
-                                              std::tuple<Args2...>& src) {
-  copy(std::get<I>(dest), std::get<I>(src));
-  copyTuple<I + 1>(dest, src);
-}
-
-// call member function
-template <typename C,
-          typename FC,
-          typename R,
-          typename... FArgs,
-          typename... Args>
-R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
-  return (obj.*f)(args...);
-}
-
-template <typename T>
-class ReturnType {
- public:
-  typedef T type;
-};
-
-template <>
-class ReturnType<CpuMatrix> {
- public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReturnType<CpuIVector> {
- public:
-  typedef GpuIVector type;
-};
-
-template <>
-class ReturnType<CpuSparseMatrix> {
- public:
-  typedef GpuSparseMatrix type;
-};
-
-template <typename T>
-typename ReturnType<T>::type autoArgs(T& v) {
-  return v;
-}
-
-template <>
-GpuMatrix autoArgs(CpuMatrix& v) {
-  GpuMatrix a(v.getHeight(), v.getWidth());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuIVector autoArgs(CpuIVector& v) {
-  GpuIVector a(v.getSize());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
-  GpuSparseMatrix a(v.getHeight(),
-                    v.getWidth(),
-                    v.getElementCnt(),
-                    v.getValueType(),
-                    v.getFormat());
-  a.copyFrom(v, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return a;
-}
-
-class AutoCompare {
- public:
-  /**
-   * err is the allowed calculation error.
-   * The smaller the value of err,
-   * the stricter the comparison is between CPU and GPU calculations.
-   */
-  AutoCompare(size_t height, size_t width, real err = 1e-3)
-      : cpu(height, width), gpu(height, width), compare(err) {
-    init(cpu);
-    copy(gpu, cpu);
-  }
-
-  template <typename C, typename R, typename... FArgs, typename... Args>
-  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
-    static_assert(sizeof...(FArgs) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    call(cpu, f, args...);
-    call(gpu, f, autoArgs(args)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
-  template <std::size_t... I, typename C, typename R, typename... Args>
-  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
-    static_assert(sizeof...(I) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    (void)height;
-    (void)width;
-    auto tuple1 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            CpuMatrix>::type>(height, width)...);
-
-    auto tuple2 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            GpuMatrix>::type>(height, width)...);
-
-    initTuple(tuple1);
-    copyTuple(tuple2, tuple1);
-
-    call(cpu, f, std::get<I>(tuple1)...);
-    call(gpu, f, std::get<I>(tuple2)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
- protected:
-  CpuMatrix cpu;
-  GpuMatrix gpu;
-  AssertEqual compare;
-};
-
-}  // namespace autotest
diff --git a/paddle/legacy/math/tests/test_Allocator.cpp b/paddle/legacy/math/tests/test_Allocator.cpp
deleted file mode 100644
index 122be9082..000000000
--- a/paddle/legacy/math/tests/test_Allocator.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-#define private public
-#include "paddle/legacy/math/Allocator.h"
-#include "paddle/legacy/math/MemoryHandle.h"
-#include "paddle/legacy/math/PoolAllocator.h"
-
-using namespace paddle;  // NOLINT
-
-template <typename Allocator>
-void testPoolAllocator() {
-  PoolAllocator* pool =
-      new PoolAllocator(new Allocator(), /* sizeLimit */ 1024);
-
-  /* alloc from system memory */
-  void* ptr1 = pool->alloc(10);
-  void* ptr2 = pool->alloc(200);
-  void* ptr3 = pool->alloc(200);
-  pool->free(ptr1, 10);
-  pool->free(ptr2, 200);
-  pool->free(ptr3, 200);
-  pool->printAll();
-  EXPECT_EQ((size_t)2, pool->pool_.size());
-  EXPECT_EQ((size_t)1, pool->pool_[10].size());
-  EXPECT_EQ((size_t)2, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, pool->pool_[10][0]);
-  EXPECT_EQ(ptr2, pool->pool_[200][0]);
-  EXPECT_EQ(ptr3, pool->pool_[200][1]);
-
-  /* alloc from pool */
-  void* ptr4 = pool->alloc(10);
-  void* ptr5 = pool->alloc(200);
-  pool->printAll();
-  EXPECT_EQ((size_t)0, pool->pool_[10].size());
-  EXPECT_EQ((size_t)1, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, ptr4);
-  EXPECT_EQ(ptr3, ptr5);
-  pool->free(ptr4, 10);
-  pool->free(ptr5, 200);
-
-  /* alloc size > sizeLimit */
-  void* ptr6 = pool->alloc(1024);
-  pool->free(ptr6, 1024);
-  EXPECT_LE((size_t)1024, pool->poolMemorySize_);
-
-  void* ptr7 = pool->alloc(1);
-  EXPECT_EQ((size_t)0, pool->poolMemorySize_);
-  EXPECT_EQ((size_t)0, pool->pool_.size());
-  pool->free(ptr7, 1);
-
-  delete pool;
-}
-
-TEST(Allocator, Pool) {
-  testPoolAllocator<CpuAllocator>();
-#ifdef PADDLE_WITH_CUDA
-  testPoolAllocator<GpuAllocator>();
-#endif
-}
-
-TEST(MemoryHandle, Cpu) {
-  for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) {
-    CpuMemoryHandle handle(size);
-    EXPECT_LE(handle.getSize(), handle.getAllocSize());
-  }
-
-  void* ptr1;
-  void* ptr2;
-  {
-    CpuMemoryHandle handle(256);
-    ptr1 = handle.getBuf();
-  }
-  {
-    CpuMemoryHandle handle(256);
-    ptr2 = handle.getBuf();
-  }
-  EXPECT_EQ(ptr1, ptr2);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MemoryHandle, Gpu) {
-  int numGpu = hl_get_device_count();
-
-  /* alloc from system memory */
-  void* ptr3[numGpu];
-  void* ptr4[numGpu];
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle2(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    ptr3[i] = handle3.getBuf();
-    ptr4[i] = handle4.getBuf();
-  }
-
-  /* alloc from pool */
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    EXPECT_EQ(ptr3[i], handle3.getBuf());
-    EXPECT_EQ(ptr4[i], handle4.getBuf());
-  }
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_BaseMatrix.cpp b/paddle/legacy/math/tests/test_BaseMatrix.cpp
deleted file mode 100644
index 488765c6a..000000000
--- a/paddle/legacy/math/tests/test_BaseMatrix.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
- * implementation of CPU and GPU member function in
- * BaseMatrix.cpp and Matrix.cpp.
- */
-
-#include <gtest/gtest.h>
-#include "TestUtils.h"
-#include "paddle/legacy/math/BaseMatrix.h"
-
-using paddle::BaseMatrix;
-using paddle::Matrix;
-using autotest::AutoCompare;
-
-// Test all void (BaseMatrix::*)() function
-TEST(BaseMatrix, void) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)()) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg(f, height, width);
-      };
-
-      compare(&BaseMatrix::neg);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::zero);
-      compare(&BaseMatrix::one);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real) function
-TEST(BaseMatrix, real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::biggerThanScalar);
-      compare(&BaseMatrix::downClip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::relu);
-      compare(&BaseMatrix::reluDerivative);
-      compare(&BaseMatrix::softrelu);
-      compare(&BaseMatrix::softreluDerivative);
-      compare(&BaseMatrix::brelu);
-      compare(&BaseMatrix::breluDerivative);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::squareDerivative);
-      compare(&BaseMatrix::tanh);
-      compare(&BaseMatrix::tanhDerivative);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::reciprocalDerivative);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::absDerivative);
-      compare(&BaseMatrix::sigmoid);
-      compare(&BaseMatrix::sigmoidDerivative);
-      compare(&BaseMatrix::expDerivative);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareMul);
-      compare(&BaseMatrix::addColVector);
-      compare(&BaseMatrix::addRowVector);
-      compare(&BaseMatrix::mulRowVector);
-      compare(&BaseMatrix::divRowVector);
-      compare(&BaseMatrix::mulColVector);
-      compare(&BaseMatrix::divColVector);
-      compare(&BaseMatrix::addP2P);
-      compare(&BaseMatrix::invSqrt);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real, real) function
-TEST(BaseMatrix, real_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::clip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
-TEST(BaseMatrix, BaseMatrix_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::addBias);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::addScalar);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::scalarDiv);
-      compare(&BaseMatrix::addSquare);
-      compare(&BaseMatrix::isEqualTo);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height,
-                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::softCrossEntropy);
-      compare(&BaseMatrix::softCrossEntropyBp);
-      compare(&BaseMatrix::binaryLabelCrossEntropy);
-      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::add2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotDiv);
-      compare(&BaseMatrix::logisticRegressionLoss);
-      compare(&BaseMatrix::logisticRegressionLossBp);
-      compare(&BaseMatrix::biggerThan);
-      compare(&BaseMatrix::max2);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareSquare);
-    }
-  }
-}
-
-void TestEelementWise(size_t height, size_t width) {
-  AutoCompare rowScale(height, width);
-  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
-
-  AutoCompare rowDotMul(height, width);
-  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
-
-  AutoCompare binaryClassificationError(height, width);
-  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
-      &BaseMatrix::binaryClassificationError, height, width);
-
-  AutoCompare sumOfSquaresBp(height, width);
-  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
-}
-
-void TestAggregateToRow(size_t height, size_t width) {
-  AutoCompare maxCols(1, width);
-  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
-
-  AutoCompare minCols(1, width);
-  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
-
-  AutoCompare addDotMulVMM(1, width);
-  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
-
-  AutoCompare sumCols(1, width);
-  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
-
-  AutoCompare collectBias(1, width);
-  collectBias.cmpWithoutArg<0, 1>(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
-      height,
-      width);
-}
-
-void TestAggregateToCol(size_t height, size_t width) {
-  AutoCompare maxRows(height, 1);
-  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
-
-  AutoCompare minRows(height, 1);
-  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
-
-  AutoCompare sumRows(height, 1);
-  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
-
-  AutoCompare sumOfSquares(height, 1);
-  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
-}
-
-TEST(BaseMatrix, Other) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      TestEelementWise(height, width);
-      TestAggregateToRow(height, width);
-      TestAggregateToCol(height, width);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_CpuGpuVector.cpp b/paddle/legacy/math/tests/test_CpuGpuVector.cpp
deleted file mode 100644
index 010fef534..000000000
--- a/paddle/legacy/math/tests/test_CpuGpuVector.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(CpuGpuVector, getData) {
-  size_t size = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuVectorPtr cpuVec = std::make_shared<CpuVector>(size);
-  GpuVectorPtr gpuVec = std::make_shared<GpuVector>(size);
-  cpuVec->uniform(0.0, 10.0);
-  gpuVec->copyFrom(*cpuVec, stream);
-  hl_stream_synchronize(stream);
-
-  CpuGpuVectorPtr vec = std::make_shared<CpuGpuVector>(gpuVec);
-  auto a = vec->getData(false);
-  auto b = cpuVec->getData();
-  hl_stream_synchronize(stream);
-  checkDataEqual(a, b, size);
-}
-
-TEST(CpuGpuVector, subCreate) {
-  size_t size1 = 1024;
-  size_t offset = 100;
-  size_t size2 = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuGpuVectorPtr v1 = std::make_shared<CpuGpuVector>(size1, /*useGpu*/ false);
-  auto vec = v1->getMutableVector(false);
-  vec->uniform(0.0, 10.0);
-  auto v2 = std::make_shared<CpuGpuVector>(*v1, offset, size2);
-  CHECK_EQ(*v1->getSync(), *v2->getSync());
-
-  // check subVec equal
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  CpuVectorPtr v1Check = std::make_shared<CpuVector>(size1);
-  CpuVectorPtr v2Check = std::make_shared<CpuVector>(size2);
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  checkDataEqual(v2->getData(false), v2Check->getData(), size2);
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-
-  CpuVectorPtr noise = std::make_shared<CpuVector>(size2);
-  noise->uniform(0.0, 1.0);
-  auto v = v2->getMutableVector(false);  // will change header
-  // add noise to subVec
-  v->add(*noise);
-
-  // check v1_cpu_data == v2_cpu_data
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  // check v1_gpu_data == v2_gpu_data
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_ExecViaCpu.cpp b/paddle/legacy/math/tests/test_ExecViaCpu.cpp
deleted file mode 100644
index b2ce0bc7e..000000000
--- a/paddle/legacy/math/tests/test_ExecViaCpu.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Util.h>
-#include <vector>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-
-const int height = 10;
-const int width = 16;
-
-real f(Matrix& mat1,
-       const Matrix& mat2,
-       IVector& vec1,
-       const IVector& vec2,
-       real scalar) {
-  CHECK(!mat1.useGpu());
-  CHECK(!mat2.useGpu());
-  CHECK(!vec1.useGpu());
-  CHECK(!vec2.useGpu());
-  mat1.copyFrom(mat2);
-  vec1.copyFrom(vec2);
-
-  return scalar;
-}
-
-class Functor {
- public:
-  real operator()(Matrix& mat1,
-                  const Matrix& mat2,
-                  IVector& vec1,
-                  const IVector& vec2,
-                  real scalar) {
-    a_ = f(mat1, mat2, vec1, vec2, scalar);
-    return a_;
-  }
-
- private:
-  real a_;
-};
-
-template <typename F>
-void testWrapper(F&& f) {
-  MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false);
-  MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false);
-
-  IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false);
-  IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false);
-
-  const real scalar = 1.23456;
-
-  MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true);
-  MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true);
-  IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true);
-  IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true);
-
-  cpumat2->randomizeUniform();
-  cpuvec2->rand(width);
-  gpumat2->copyFrom(*cpumat2);
-  gpuvec2->copyFrom(*cpuvec2);
-
-  real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456);
-  EXPECT_EQ(ret, scalar);
-  cpumat1->copyFrom(*gpumat1);
-  cpuvec1->copyFrom(*gpuvec1);
-
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i));
-    for (int j = 0; j < width; ++j) {
-      EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j));
-    }
-  }
-  gpumat1->resize(height, 1);
-  execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1);
-
-  cpumat1->resize(height, 1);
-  cpumat1->selectElements(*cpumat2, *cpuvec1);
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(ExecViaCpu, test1) {
-  testWrapper(f);
-  testWrapper(&f);
-
-  auto lambda = [](Matrix& mat1,
-                   const Matrix& mat2,
-                   IVector& vec1,
-                   const IVector& vec2,
-                   real scalar) -> real {
-    return f(mat1, mat2, vec1, vec2, scalar);
-  };
-  LOG(INFO) << "lambda is_class=" << std::is_class<decltype(lambda)>::value
-            << " is_function=" << std::is_function<decltype(lambda)>::value;
-  testWrapper(lambda);
-
-  Functor functor;
-  testWrapper(functor);
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_FPException.cpp b/paddle/legacy/math/tests/test_FPException.cpp
deleted file mode 100644
index aa6aea71c..000000000
--- a/paddle/legacy/math/tests/test_FPException.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/**
- * This test is about floating point calculation exception.
- * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions.
- *
- * Some exceptions occur in the middle of a set of formulas,
- * that can be circumvented by some tricks.
- * For example,
- * calculate tanh
- *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
- *
- * If the result of (-2 * a) is too large,
- * a FE_OVERFLOW exception occurs when calculating exp.
- * But the result of tanh is no overflow problem,
- * so we can add some tricks to prevent exp calculate an excessive value.
- *
- */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Common.h"
-
-using namespace paddle;  // NOLINT
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-template <typename Matrix>
-void testTanh(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->tanh(*B);
-}
-
-template <typename Matrix>
-void testSigmoid(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->sigmoid(*B);
-}
-
-TEST(fp, overflow) {
-  for (auto illegal : {-90.0, 90.0}) {
-    LOG(INFO) << " illegal=" << illegal;
-    testTanh<CpuMatrix>(illegal);
-    testSigmoid<CpuMatrix>(illegal);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/math/tests/test_GpuProfiler.cpp b/paddle/legacy/math/tests/test_GpuProfiler.cpp
deleted file mode 100644
index ee27109f2..000000000
--- a/paddle/legacy/math/tests/test_GpuProfiler.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    target->bilinearForward(*input,
-                            imgSizeH,
-                            imgSizeW,
-                            2 * imgSizeH,
-                            2 * imgSizeW,
-                            channels,
-                            ratioH,
-                            ratioW);
-    targetGpu->bilinearForward(*inputGpu,
-                               imgSizeH,
-                               imgSizeW,
-                               2 * imgSizeH,
-                               2 * imgSizeW,
-                               channels,
-                               ratioH,
-                               ratioW);
-  }
-
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->bilinearBackward(*targetGrad,
-                              2 * imgSizeH,
-                              2 * imgSizeW,
-                              imgSizeH,
-                              imgSizeW,
-                              channels,
-                              ratioH,
-                              ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad,
-                                 2 * imgSizeH,
-                                 2 * imgSizeW,
-                                 imgSizeH,
-                                 imgSizeW,
-                                 channels,
-                                 ratioH,
-                                 ratioW);
-
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
-}
-
-TEST(Profiler, testBilinearFwdBwd) {
-  auto numSamples = 10;
-  auto channels = 16;
-  auto imgSize = 64;
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    // Paddle built-in timer
-    REGISTER_TIMER_INFO(
-        "testBilinearFwdBwd",
-        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
-  }
-  globalStat.printAllStatus();
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  // nvprof: GPU Proflier
-  REGISTER_GPU_PROFILER(
-      "RecursiveProfilingTest",
-      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_Matrix.cpp b/paddle/legacy/math/tests/test_Matrix.cpp
deleted file mode 100644
index a9407a31f..000000000
--- a/paddle/legacy/math/tests/test_Matrix.cpp
+++ /dev/null
@@ -1,273 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file use autotest::AutoCompare and cmpWithArg to compares the
- * implementation of CPU and GPU member function in Matrix.cpp.
- */
-
-#include <gtest/gtest.h>
-#include "TestUtils.h"
-
-using paddle::BaseMatrix;
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::CpuIVector;
-using paddle::CpuSparseMatrix;
-using autotest::AutoCompare;
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-
-  AutoCompare forward(numSamples, outWidth);
-  CpuMatrix arg1(numSamples, inWidth);
-  arg1.randomizeUniform();
-  forward.cmpWithArg(&Matrix::bilinearForward,
-                     arg1,
-                     imgSizeH,
-                     imgSizeW,
-                     2 * imgSizeH,
-                     2 * imgSizeW,
-                     channels,
-                     ratioH,
-                     ratioW);
-
-  AutoCompare backward(numSamples, inWidth);
-  CpuMatrix arg2(numSamples, outWidth);
-  arg2.randomizeUniform();
-  backward.cmpWithArg(&Matrix::bilinearBackward,
-                      arg2,
-                      2 * imgSizeH,
-                      2 * imgSizeW,
-                      imgSizeH,
-                      imgSizeW,
-                      channels,
-                      ratioH,
-                      ratioW);
-}
-
-TEST(Matrix, BilinearFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-          testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels);
-        }
-      }
-    }
-  }
-}
-
-void testMatrixAddBias(int height, int width, real scale) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(1, width);
-  arg1.randomizeUniform();
-  test.cmpWithArg(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::addBias),
-      arg1,
-      scale);
-}
-
-void testMatrixAddDotMulMMV(int height, int width) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(1, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  test.cmpWithArg(&BaseMatrix::addDotMulMMV, arg1, arg2);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testMatrixAddBias(height, width, 1.0);
-      testMatrixAddBias(height, width, 3.5);
-      testMatrixAddDotMulMMV(height, width);
-    }
-  }
-}
-
-void testMatrixAddAtOffset(int height, int width1, int width2, int offset) {
-  AutoCompare test(height, width2);
-  CpuMatrix arg1(height, width1);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::addAtOffset, arg1, offset);
-}
-
-void testMatrixAssignAtOffset(int height, int width1, int width2, int offset) {
-  AutoCompare test(height, width2);
-  CpuMatrix arg1(height, width1);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::assignAtOffset, arg1, offset);
-}
-
-TEST(Matrix, AtOffset) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width1 : {1, 32, 100, 512, 1000}) {
-      for (auto width2 : {1, 32, 100, 512, 1000}) {
-        int columnOffset = 0;
-        int offset = std::abs(width1 - width2);
-        if (offset) {
-          columnOffset = std::rand() % offset;
-        }
-        VLOG(3) << " height=" << height << " width1=" << width1
-                << " width2=" << width2 << " columnOffset = " << columnOffset;
-        testMatrixAddAtOffset(height, width1, width2, columnOffset);
-        testMatrixAssignAtOffset(height, width1, width2, columnOffset);
-      }
-    }
-  }
-}
-
-void testMatrixSelectRows(int numSamples, int tableSize, int inputDim) {
-  AutoCompare test(numSamples, inputDim);
-  CpuMatrix arg1(tableSize, inputDim);
-  CpuIVector arg2(numSamples);
-  arg1.randomizeUniform();
-  arg2.rand(tableSize);
-  test.cmpWithArg(&Matrix::selectRows, arg1, arg2);
-}
-
-TEST(Matrix, tableProjection) {
-  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
-    for (auto tableSize : {10, 100}) {
-      for (auto inputDim : {20, 50}) {
-        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
-                << " inputDim=" << inputDim;
-        testMatrixSelectRows(numSamples, tableSize, inputDim);
-      }
-    }
-  }
-}
-
-void testMatrixCopyByRowIndex(int outHeight, int inHeight, int width) {
-  AutoCompare test(outHeight, width);
-  CpuMatrix arg1(inHeight, width);
-  CpuIVector arg2(outHeight);
-  arg1.randomizeUniform();
-  arg2.rand(inHeight);
-  test.cmpWithArg(&Matrix::copyByRowIndex, arg1, arg2);
-}
-
-TEST(Matrix, copyByRowIndex) {
-  for (auto outHeight : {31, 500, 1000}) {
-    for (auto inHeight : {17, 257, 500, 1200}) {
-      for (auto width : {512, 1024}) {
-        VLOG(3) << outHeight << " " << inHeight << " " << width;
-        testMatrixCopyByRowIndex(outHeight, inHeight, width);
-      }
-    }
-  }
-}
-
-void testParamReluForward(int height, int width, int w_height, int w_width) {
-  AutoCompare test(height, width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(w_height, w_width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg1.add(-0.5);
-  test.cmpWithArg(&Matrix::paramReluForward, arg1, arg2);
-}
-
-void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
-  AutoCompare test(w_height, w_width);
-  CpuMatrix arg1(height, width);
-  CpuMatrix arg2(height, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg2.add(-0.5);
-  test.cmpWithArg(&Matrix::paramReluBackwardW, arg1, arg2);
-}
-
-TEST(Matrix, paramRelu) {
-  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 40, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          if (width % (w_height * w_width)) continue;
-          testParamReluForward(height, width, w_height, w_width);
-          testParamReluBackwardW(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testAddSharedBias(int numSamples, int dim, int channel) {
-  AutoCompare test(numSamples, dim);
-  CpuMatrix arg1(1, channel);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::addSharedBias, arg1, 1.0);
-}
-
-void testCollectSharedBias(int numSamples, int dim, int channel) {
-  AutoCompare test(1, channel);
-  CpuMatrix arg1(numSamples, dim);
-  arg1.randomizeUniform();
-  test.cmpWithArg(&Matrix::collectSharedBias, arg1, 1.0);
-}
-
-TEST(Matrix, sharedBias) {
-  for (auto numSamples : {1, 100, 520}) {
-    for (auto dim : {100 * 16, 100 * 32}) {
-      for (auto channel : {8, 16}) {
-        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
-                << " channel=" << channel;
-        testAddSharedBias(numSamples, dim, channel);
-        testCollectSharedBias(numSamples, dim, channel);
-      }
-    }
-  }
-}
-
-void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) {
-  AutoCompare forward(numSamples, 1);
-  CpuMatrix arg1(numSamples, dim);
-  CpuSparseMatrix arg2(
-      numSamples, dim, numSamples, paddle::NO_VALUE, paddle::SPARSE_CSR);
-
-  CpuMatrix output1(numSamples, dim);
-  output1.randomizeUniform();
-  output1.softmax(arg1);
-  for (int i = 0; i < numSamples; i++) {
-    const unsigned int id = std::rand() % dim;
-    arg2.setRow(i, 1, &id, nullptr);
-  }
-  forward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropy, arg1, arg2);
-
-  AutoCompare backward(numSamples, dim);
-  backward.cmpWithArg(&Matrix::multiBinaryLabelCrossEntropyBp, arg1, arg2);
-}
-
-TEST(Matrix, multiBinaryCrossEntropy) {
-  for (auto numSamples : {100, 1000, 10000}) {
-    for (auto dim : {100, 1000, 10000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testMultiBinaryLabelCrossEntropy(numSamples, dim);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_RowBuffer.cpp b/paddle/legacy/math/tests/test_RowBuffer.cpp
deleted file mode 100644
index 2ef8cd303..000000000
--- a/paddle/legacy/math/tests/test_RowBuffer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/RowBuffer.h"
-
-TEST(RowBuffer, testAutoGrow) {
-  paddle::RowBuffer buf(128);
-  ASSERT_EQ(128UL, buf.getWidth());
-  ASSERT_TRUE(buf.isAutoGrowth());
-  buf.resize(2);
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-
-  auto data = buf.getWithAutoGrowth(2);
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    data[i] = i;
-  }
-
-  ASSERT_EQ(3UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
-  }
-}
-
-TEST(RowBuffer, testWithMemBuf) {
-  paddle::CpuMemHandlePtr mem =
-      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
-  paddle::RowBuffer buf(mem, 128);
-  ASSERT_TRUE(!buf.isAutoGrowth());
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
-    }
-  }
-
-  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
-}
diff --git a/paddle/legacy/math/tests/test_SIMDFunctions.cpp b/paddle/legacy/math/tests/test_SIMDFunctions.cpp
deleted file mode 100644
index c6490f70e..000000000
--- a/paddle/legacy/math/tests/test_SIMDFunctions.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <random>
-
-#include <stdlib.h>
-#include <time.h>
-
-static constexpr size_t VECTOR_LEN = 3072;
-static constexpr size_t BATCH_SIZE = 64;
-static constexpr size_t ALIGN = 32;
-static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0");
-static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0");
-static constexpr float EPSILON = 1e-5;
-static std::mt19937 RandomEngine(time(0));
-
-inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
-                                                 size_t align = ALIGN) {
-  float* ptr;
-  CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
-  return std::unique_ptr<float[]>(ptr);
-}
-
-inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
-                                                       size_t align = ALIGN) {
-  std::uniform_real_distribution<float> dist(-100.0f, 100.0f);
-  auto generator = std::bind(dist, RandomEngine);
-  auto retv = NewVector(len, align);
-  std::generate_n(retv.get(), len, generator);
-  return retv;
-}
-
-TEST(SIMDFunction, addTo) {
-  typedef std::function<void(float*, const float*, size_t)> AddToMethodType;
-
-  AddToMethodType naive = paddle::simd::naive::addTo<float>;
-  AddToMethodType simd = paddle::simd::addTo<float>;
-
-  auto A = NewRandomVector();
-  auto B = NewRandomVector();
-
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float));
-
-  naive(A.get(), B.get(), VECTOR_LEN);
-  simd(ACopy.get(), B.get(), VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, batchAddTo) {
-  auto A = NewRandomVector();
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN);
-
-  std::vector<std::unique_ptr<float[]>> B;
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    B.emplace_back(NewRandomVector());
-  }
-  std::unique_ptr<float* []> BRaw(new float*[BATCH_SIZE]);
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    BRaw[i] = B[i].get();
-  }
-
-  typedef std::function<void(float*, const float**, int, size_t)>
-      BatchAddToMethodType;
-
-  BatchAddToMethodType naive = paddle::simd::naive::batchAddTo<float>;
-  BatchAddToMethodType simd = paddle::simd::batchAddTo<float>;
-
-  naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, colMax) {
-  auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE);
-  auto naiveResult = NewVector(BATCH_SIZE);
-  auto simdResult = NewVector(BATCH_SIZE);
-
-  typedef std::function<void(float*, const float*, int, int)> ColMaxMethodType;
-  ColMaxMethodType naive = paddle::simd::naive::colMax<float>;
-  ColMaxMethodType simd = paddle::simd::colMax<float>;
-
-  naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lr = NewRandomVector();
-  auto lambda = 0.23f;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float*, float, size_t)>
-      DecayL1MethodType;
-
-  DecayL1MethodType naive = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, lr, l, len);
-  };
-
-  DecayL1MethodType simd = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, lr, l, len);
-  };
-
-  naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithoutLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lambda = 0.23;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float, size_t)> DecayL1MethodType;
-
-  DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, l, len);
-  };
-
-  DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, l, len);
-  };
-
-  naive(dest.get(), src.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
diff --git a/paddle/legacy/math/tests/test_SparseMatrix.cpp b/paddle/legacy/math/tests/test_SparseMatrix.cpp
deleted file mode 100644
index 30896a945..000000000
--- a/paddle/legacy/math/tests/test_SparseMatrix.cpp
+++ /dev/null
@@ -1,565 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <vector>
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(Matrix, CopyCpuMatrixToSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)),
-      ret2(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-struct MatrixPara {
-  size_t height;
-  size_t width;
-  bool trans;
-  bool sparse;
-  size_t nnz;
-  SparseFormat format;
-};
-
-#ifdef PADDLE_WITH_CUDA
-void test_sparse_matrix_mul(MatrixPara paraA,
-                            MatrixPara paraB,
-                            MatrixPara paraC) {
-  // for cpu sparse matrix mul
-  MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h;
-  // for gpu sparse matrix mul
-  MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC;
-  // for cpu dense matrix mul
-  MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC;
-
-  if (paraA.sparse) {
-    cpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            false);
-    gpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            true);
-  } else {
-    cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-    gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true);
-  }
-  cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-
-  if (paraB.sparse) {
-    cpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            false);
-    gpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            true);
-  } else {
-    cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-    gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true);
-  }
-  cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-
-  if (paraC.sparse) {
-    cpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            false);
-    gpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            true);
-    gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height,
-                                                paraC.width,
-                                                paraC.nnz,
-                                                FLOAT_VALUE,
-                                                paraC.format,
-                                                paraC.trans,
-                                                false);
-  } else {
-    cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-    gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true);
-    gpuMatrixC_d2h =
-        Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-  }
-  cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-
-  /*matrix init*/
-  hl_stream_t stream(HPPL_STREAM_1);
-  cpuMatrixA->randomizeUniform();
-  cpuMatrixB->randomizeUniform();
-  cpuMatrixC->randomizeUniform();
-
-  gpuMatrixA->copyFrom(*cpuMatrixA, stream);
-  gpuMatrixB->copyFrom(*cpuMatrixB, stream);
-  gpuMatrixC->copyFrom(*cpuMatrixC, stream);
-
-  cpuDenseA->copyFrom(*cpuMatrixA);
-  cpuDenseB->copyFrom(*cpuMatrixB);
-  cpuDenseC->copyFrom(*cpuMatrixC);
-
-  hl_stream_synchronize(stream);
-
-  /*matrix mul*/
-  cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0);
-  gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0);
-  cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0);
-
-  gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
-  hl_stream_synchronize(stream);
-
-  /*check result*/
-  if (paraC.sparse) {
-    checkSMatrixEqual(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuSparseMatrix>(gpuMatrixC_d2h));
-    checkSMatrixEqual2Dense(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuMatrix>(cpuDenseC));
-  } else {
-    checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h);
-    checkMatrixEqual(cpuMatrixC, cpuDenseC);
-  }
-}
-
-TEST(Matrix, SparseMatrixMul) {
-  const size_t DIM_M = 4;
-  const size_t DIM_N = 4;
-  const size_t DIM_K = 8;
-  const size_t NNZ = 5;
-  for (auto format : {SPARSE_CSC, SPARSE_CSR}) {
-    std::string str_format = format == SPARSE_CSC ? "CSC" : "CSR";
-    LOG(INFO) << "test dense mul " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul " << str_format << "  trans";
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-
-    LOG(INFO) << "test denseT mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-  }
-}
-
-TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-
-  MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true);
-  hl_stream_t gpuStream(HPPL_STREAM_3);
-  testGpuMatrix->copyFrom(*testMatrix, gpuStream);
-  hl_stream_synchronize(gpuStream);
-
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST));
-  mulGpuMatrix->copyFrom(*mulCpuMatrix);
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-#endif
-
-TEST(Matrix, SparseMatrixTranspose) {
-  for (auto height : {10, 50, 100}) {
-    for (auto width : {10, 50, 100}) {
-      auto nnz = height * width;
-      for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-        for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-          for (auto sparseRate : {0.1, 0.2, 0.5}) {
-            MatrixPtr matA = Matrix::createSparseMatrix(
-                height, width, size_t(nnz * sparseRate), valueType, format);
-            MatrixPtr matB(new CpuSparseMatrix(
-                width, height, size_t(nnz * sparseRate), valueType, format));
-            matA->randomizeUniform();
-            matA->transpose(matB, false);
-
-            /*dense matrix transpose*/
-            CpuMatrixPtr matC(new CpuMatrix(height, width));
-            matC->copyFrom(*matA);
-            MatrixPtr matD(new CpuMatrix(width, height));
-            matC->transpose(matD, false);
-
-            /*check result*/
-            checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
-                std::dynamic_pointer_cast<CpuMatrix>(matD));
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixSubMatrix) {
-  const size_t HEIGHT = 10;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-    size_t startRow = 3;
-    size_t rowNum = 2;
-    real sparseRate = 0.1;
-    /*sparse matrix init and get subMatrix*/
-    CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-        HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR);
-    matA->randomizeUniform();
-    CpuSparseMatrixPtr matB = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        matA->subMatrix(startRow, rowNum));
-
-    int start = matA->getRows()[startRow];
-    int end = matA->getRows()[startRow + rowNum];
-
-    /*compare two matrix*/
-    ASSERT_EQ(matB->getElementCnt(), size_t(end - start));
-    if (valueType == FLOAT_VALUE) {
-      for (size_t i = 0; i < matB->getElementCnt(); i++) {
-        ASSERT_FLOAT_EQ(matB->getValue()[start + i],
-                        matA->getValue()[start + i]);
-      }
-    }
-
-    for (size_t i = 0; i < matB->getElementCnt(); i++) {
-      ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]);
-    }
-    for (size_t i = 0; i < rowNum; i++) {
-      ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]);
-    }
-  }
-}
-
-void sparseValid(
-    int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) {
-  CHECK_EQ(nnz, size_t(major[majorLen - 1]));
-  CHECK_EQ(nnz, minorLen);
-  for (size_t i = 0; i < majorLen - 1; i++) {
-    EXPECT_LE(major[i], major[i + 1]);
-    for (int j = major[i]; j < major[i + 1] - 1; j++) {
-      EXPECT_LE(minor[j], minor[j + 1]);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixRandUniform) {
-  const size_t HEIGHT = 5;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  int* major = nullptr;
-  int* minor = nullptr;
-  size_t majorLen = 0;
-  size_t minorLen = 0;
-  size_t nnz = 0;
-  for (auto valueType : {NO_VALUE, FLOAT_VALUE}) {
-    for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-      CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-          HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format);
-      matA->randomizeUniform();
-      nnz = matA->getElementCnt();
-      if (format == SPARSE_CSR) {
-        majorLen = matA->getHeight() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getRows();
-        minor = matA->getCols();
-      } else {
-        majorLen = matA->getWidth() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getCols();
-        minor = matA->getRows();
-      }
-      sparseValid(major, minor, nnz, majorLen, minorLen);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixCopyFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32};
-  sparse_non_value_t data[32];
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = ::rand() % width;
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, NO_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-}
-
-TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  sparse_float_value_t data[32];
-  int value[32] = {
-      1,                       // row_0 : 1
-      5, 3, 1, 6,              // row_1 : 4
-      0, 1, 2, 3,              // row_3 : 4
-      4, 5, 6, 7,              // row_4 : 4
-      2, 3,                    // row_5 : 2
-      3, 5,                    // row_6 : 2
-      0, 1,                    // row_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // row_8 : 8
-      2, 4, 7, 3, 1            // row_9 : 5
-  };
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = value[i];
-    data[i].value = float(value[i]);
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-
-  size_t trimedWidth = 4;
-  int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19};
-  sparse_float_value_t trimedData[19];
-  int trimedValue[19] = {
-      1,  // row_0 : 1
-      3,
-      1,  // row_1 : 2
-      0,
-      1,
-      2,
-      3,  // row_3 : 4
-      2,
-      3,  // row_5 : 2
-      3,  // row_6 : 1
-      0,
-      1,  // row_7 : 2
-      0,
-      1,
-      2,
-      3,  // row_8 : 4
-      2,
-      3,
-      1  // row_9 : 3
-  };
-  for (size_t i = 0; i < 19; i++) {
-    trimedData[i].col = trimedValue[i];
-    trimedData[i].value = float(trimedValue[i]);
-  }
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false);
-  matA->copyFrom(trimedIndices, trimedData);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getRows()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[height] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSR,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
-
-TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
-  size_t height = 8;
-  size_t width = 10;
-  int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  int value[32] = {
-      1,                       // col_0 : 1
-      5, 3, 1, 6,              // col_1 : 4
-      0, 1, 2, 3,              // col_3 : 4
-      4, 5, 6, 7,              // col_4 : 4
-      2, 3,                    // col_5 : 2
-      3, 5,                    // col_6 : 2
-      0, 1,                    // col_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // col_8 : 8
-      2, 4, 7, 3, 1            // col_9 : 5
-  };
-  std::vector<int> rows(value, value + 32);
-  std::vector<int> cols(indices, indices + 11);
-  std::vector<real> values(value, value + 32);
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSC, false);
-  mat->copyFrom(rows, cols, values);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getCols()[0]));
-  for (size_t i = 1; i < width + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getCols()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i]));
-  }
-
-  size_t trimedWidth = 5;
-  int trimedIndices[6] = {0, 1, 5, 5, 9, 13};
-  int trimedValue[13] = {
-      1,  // col_0 : 1
-      5,
-      3,
-      1,
-      6,  // col_1 : 4
-      0,
-      1,
-      2,
-      3,  // col_3 : 4
-      4,
-      5,
-      6,
-      7  // col_4 : 4
-  };
-  std::vector<int> rowsA(trimedValue, trimedValue + 13);
-  std::vector<int> colsA(trimedIndices, trimedIndices + 6);
-  std::vector<real> valuesA(trimedValue, trimedValue + 13);
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false);
-  matA->copyFrom(rowsA, colsA, valuesA);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getCols()[0]));
-  for (size_t i = 1; i < trimedWidth + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getCols()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[trimedWidth] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i]));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSC,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
diff --git a/paddle/legacy/math/tests/test_Tensor.cu b/paddle/legacy/math/tests/test_Tensor.cu
deleted file mode 100644
index 3ce056d66..000000000
--- a/paddle/legacy/math/tests/test_Tensor.cu
+++ /dev/null
@@ -1,1162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuVector;
-using paddle::GpuVector;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-#define INIT_UNARY(A1, A2)  \
-  Tensor A1(height, width); \
-  Tensor A2(height, width); \
-  A1.randomizeUniform();    \
-  A2.copyFrom(A1)
-#define INIT_BINARY(A1, A2, B) \
-  INIT_UNARY(A1, A2);          \
-  Tensor B(height, width);     \
-  B.randomizeUniform()
-#define INIT_TERNARY(A1, A2, B, C) \
-  INIT_BINARY(A1, A2, B);          \
-  Tensor C(height, width);         \
-  C.randomizeUniform()
-#define INIT_QUATERNARY(A1, A2, B, C, D) \
-  INIT_TERNARY(A1, A2, B, C);            \
-  Tensor D(height, width);               \
-  D.randomizeUniform()
-
-template <typename Tensor>
-struct TestUnaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_UNARY(A1, A2);
-        testUnaryFunc(A1, A2);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestBinaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
-
-  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_BINARY(A1, A2, B);
-        testBinaryFunc(A1, A2, B);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestTernaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
-      TernaryFunc;
-
-  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_TERNARY(A1, A2, B, C);
-        testTernaryFunc(A1, A2, B, C);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestQuaternaryMatrix {
-  typedef std::function<void(
-      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
-      QuaternaryFunc;
-
-  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_QUATERNARY(A1, A2, B, C, D);
-        testQuaternaryFunc(A1, A2, B, C, D);
-      }
-    }
-  }
-};
-
-template <typename Tensor, class T>
-struct TestUnaryVectorT {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
-    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
-      LOG(INFO) << " size=" << size;
-      Tensor A1(size);
-      Tensor A2(size);
-      if (typeid(T) == typeid(real)) {
-        A1.rand();
-      } else {
-        A1.rand(1000);
-      }
-      A2.copyFrom(A1);
-      testUnaryFunc(A1, A2);
-    }
-  }
-};
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-    }
-  }
-}
-
-template <typename Tensor>
-void testTensorAddScalar(Tensor& A1, Tensor& A2) {
-  real p1 = 2.5;
-  real p2 = 3.0;
-  A1.add(p1);  // a += p
-  A2 += p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(p1, p2);  // a = a * p1 + p2
-  A2 = A2 * p1 + p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSubScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.subScalar(p);  // a -= p
-  A2 -= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMulScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.mulScalar(p);  // a *= p
-  A2 *= p;
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(learningRate, decayRate);
-  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDivScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.divScalar(p);  // a /= p
-  A2 /= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorNeg(Tensor& A1, Tensor& A2) {
-  A1.neg();  // a = -a
-  A2 = -A2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2) {
-  A1.abs2();  // a = a > 0 ? a : -a
-  A2 = A2.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2) {
-  A1.square2();  // a = a * a
-  A2 = A2.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2) {
-  A1.reciprocal2();  // a = 1.0f / a
-  A2 = A2.reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2) {
-  A1.sign2();  // a = (a > 0) - (a < 0)
-  A2 = A2.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2) {
-  A1.assign(1.5);  // a = p
-  A2 = A2.constant(1.5);
-  TensorCheckEqual(A1, A2);
-
-  A1.one();  // a = 1
-  A2 = A2.constant(1.0);
-  TensorCheckEqual(A1, A2);
-
-  A1.zero();  // a = 0
-  A2 = A2.constant(0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
-  testTensorAddScalar(A1, A2);
-  testTensorSubScalar(A1, A2);
-  testTensorMulScalar(A1, A2);
-  testTensorDivScalar(A1, A2);
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-  testTensorSquare(A1, A2);
-  testTensorReciprocal(A1, A2);
-  testTensorSign(A1, A2);
-  testTensorAssign(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
-  A1.add(2);  // a += p
-  A2 += 2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(3, 2);  // a = a * p1 + p2
-  A2 = A2 * 3 + 2;
-  TensorCheckEqual(A1, A2);
-
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-}
-
-TEST(Unary, BaseOp) {
-  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
-  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
-  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
-      testUnaryBaseOpInt<CpuIVector>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
-  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
-  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
-      testUnaryBaseOpInt<GpuIVector>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2) {
-  A1.exp2();  // a = exp(a)
-  A2 = A2.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2) {
-  A1.log2();  // a = log(a)
-  A2 = A2.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2) {
-  A1.sqrt2();  // a = sqrt(a)
-  A2 = A2.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2) {
-  A1.pow2(3.2);  // a = pow(a, p)
-  A2 = A2.pow(3.2);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrMathOp(Tensor& A1, Tensor& A2) {
-  testTensorExp(A1, A2);
-  testTensorLog(A1, A2);
-  testTensorSqrt(A1, A2);
-  testTensorPow(A1, A2);
-}
-
-TEST(Unary, MathOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorClip(Tensor& A1, Tensor& A2) {
-  real p1 = 0.003f;
-  real p2 = 0.877f;
-  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
-  // A2 = A2.min(0.877f).max(0.003f);
-  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
-  real p = 0.5f;
-  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
-  A2 = (A2 > p).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2) {
-  /**
-   * T lambda = p;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(learningRate, decayRate);
-  A2 = (A2 > (learningRate * decayRate))
-           .condition(
-               (A2 - (learningRate * decayRate)),
-               (A2 < -(learningRate * decayRate))
-                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
-  testTensorClip(A1, A2);
-  testTensorBiggerThanScalar(A1, A2);
-
-  A1.randomizeUniform();
-  A1.subScalar(0.5f);
-  A2.copyFrom(A1);
-  testTensorapplyL1(A1, A2);
-}
-
-TEST(Unary, CompareOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.2;
-  A1.add(B);  // a += b
-  A2 += B;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1);  // a += b * p
-  A2 += B * p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
-  A2 = A2 * p1 + B * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.addScalar(B, p1);  // a = b + p
-  A2 = B + p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.addSquare(B, p1);  // a += p * b * b
-  A2 += B.constant(p1) * B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
-  A2 = A2 * p1 + B.constant(p2) * B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.sub(B);  // a -= b
-  A2 -= B;
-  TensorCheckEqual(A1, A2);
-
-  A1.sub(B, p);  // a -= b * p
-  A2 -= B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.subScalar(B, p);  // a = b - p
-  A2 = B - p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.mulScalar(B, p);  // a = b * p
-  A2 = B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B);  // a *= b * b
-  A2 *= B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareMul(B);  // a = a * a * b
-  A2 = A2 * A2 * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMul(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.divScalar(B, p);  // a = b / p
-  A2 = B / p;
-  TensorCheckEqual(A1, A2);
-
-  A1.scalarDiv(B, p);  // a = p / b
-  A2 = B.constant(p) / B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.assign(B);  // a = b
-  A2 = B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.square2(A1);  // b = a * a
-  A2 = B.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.squareDerivative(B);  // a *= 2.0 * b
-  A2 = A2 * (real)2.0 * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.reciprocal2(A1);  // b = 1.0f / a
-  A2 = B.reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 0.58;
-  real p2 = 0.32;
-  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
-  A2 = (B * p1 + p2).reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
-  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
-            .reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reciprocalDerivative(B);  // a *= -b * b
-  A2 *= (-B) * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
-  A2 = B.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.abs2(A1);  // b = a > 0.0f ? a : -a
-  A2 = B.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorAdd(A1, A2, B);
-  testTensorSub(A1, A2, B);
-  testTensorMul(A1, A2, B);
-  testTensorDiv(A1, A2, B);
-  testTensorSquare(A1, A2, B);
-  testTensorSquareDerivative(A1, A2, B);
-  testTensorReciprocal(A1, A2, B);
-  testTensorReciprocalDerivative(A1, A2, B);
-  testTensorAbs(A1, A2, B);
-  testTensorSign(A1, A2, B);
-  testTensorAssign(A1, A2, B);
-}
-
-TEST(Binary, BaseOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = exp(b)
-  A1.exp2(B);
-  A2 = B.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.expDerivative(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = log(b)
-  A1.log2(B);
-  A2 = B.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = sqrt(b)
-  A1.sqrt2(B);
-  A2 = B.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = 1.0f / sqrt(b)
-  A1.invSqrt(B);
-  A2 = B.sqrt().reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.pow2(B, 2.5f);  // a = pow(b, p)
-  A2 = B.pow(2.5f);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * b = log(1.0 +
-   *         exp((a > THRESHOLD) ? THRESHOLD
-   *             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
-   */
-  B.softrelu(A1);
-
-  real THRESHOLD = 40.0;
-  A2 = (B.constant(1.0f) +
-        (B > THRESHOLD)
-            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
-            .exp())
-           .log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-   *                             ? THRESHOLD
-   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-   */
-  A1.softreluDerivative(B);
-  real THRESHOLD = 40.0;
-  A2 = A2 *
-       (B.constant(1.0f) -
-        (B.constant(-1.0f) *
-         (B > THRESHOLD)
-             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
-            .exp());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-    const T THRESHOLD_MIN = -40.0;
-    const T THRESHOLD_MAX = 13.0;
-    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
-            : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-    b = 1.0f / (1.0f + exp(-tmp)))
-   */
-  B.sigmoid(A1);
-
-  const real THRESHOLD_MIN = -40.0;
-  const real THRESHOLD_MAX = 13.0;
-  auto tmp = (B < THRESHOLD_MIN)
-                 .condition(THRESHOLD_MIN,
-                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
-  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
-  A2 *= B * (B.constant(1.0f) - B);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
-  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.tanhDerivative(B);  // a *= 1 - b * b
-  A2 *= B.constant(1.0f) - B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
-  B.scaledTanh(A1, p1, p2);
-  A2 = B.constant(p1) *
-       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
-        (real)1.0);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // a *= (p2 / p1) * (p1 * p1 - b * b));
-  A1.scaledTanhDerivative(B, p1, p2);
-  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorTanhDerivative(A1, A2, B);
-  testTensorScaledTanhDerivative(A1, A2, B);
-  testTensorSigmoidDerivative(A1, A2, B);
-  testTensorExpDerivative(A1, A2, B);
-  testTensorScaledTanh(A1, A2, B);
-  testTensorTanh(A1, A2, B);
-  testTensorExp(A1, A2, B);
-  testTensorLog(A1, A2, B);
-  testTensorSqrt(A1, A2, B);
-  testTensorInvSqrt(A1, A2, B);
-  testTensorPow(A1, A2, B);
-
-  testTensorSoftrelu(A1, A2, B);
-  testTensorSoftreluDerivative(A1, A2, B);
-  testTensorSigmoid(A1, A2, B);
-}
-
-TEST(Binary, MathOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
-  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
-  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * b = a > p1 ? a : p1
-   * b = b < p2 ? b : p2
-   * int p1 = 0, p2 = 24;
-   */
-  SetTensorValue(B, 32.0f);
-  B.brelu(A1);
-  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
-  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  SetTensorValue(B, 32.0f);
-  /*
-   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
-   * int p1 = 0, p2 = 24;
-   */
-  A1.breluDerivative(B);
-  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
-  A2 = (B > (real)0.0f)
-           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 0.613;
-  SetTensorValue(B, p);
-  A1.isEqualTo(B, p);  // a = (b == p)
-  A2 = (B == p);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
-  /**
-   * T lambda = p * b;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(B, learningRate, decayRate);
-  auto lambda = B.constant(learningRate * decayRate) * B;
-  A2 = (A2 > lambda)
-           .condition((A2 - lambda),
-                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.subScalar(0.5f);
-  SetTensorValue(B, 0.0f);
-  testTensorReluDerivative(A1, A2, B);
-
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  testTensorBreluDerivative(A1, A2, B);
-
-  testTensorAbsDerivative(A1, A2, B);
-  testTensorRelu(A1, A2, B);
-  testTensorBrelu(A1, A2, B);
-  testTensorIsEqualTo(A1, A2, B);
-}
-
-TEST(Binary, CompareOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.add(B, C);  // a = b + c
-  A2 = B + C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.8;
-  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
-  A2 = B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C);  // a = a + b + c
-  A2 = A2 + B + C;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
-  A2 = A2 * p1 + B * p2 + C * p3;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
-  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.sub(B, C);  // a = b - c
-  A2 = B - C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
-  A2 = B * p1 - C * p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotMul(B, C);  // a = b * c
-  A2 = B * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B, C);  // a = b * c * c
-  A2 = B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareSquare(B, C);  // a = b * b * c * c
-  A2 = B * B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a *= tmp * tmp
-   */
-  A1.dotMulSquareSum(B, C, p1, p2);
-  auto tmp = B * p1 + C * p2;
-  A2 *= tmp * tmp;
-  TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a = tmp * tmp
-   */
-  A1.dotSquareSum(B, C, p1, p2);
-  auto tmp2 = B * p1 + C * p2;
-  A2 = tmp2 * tmp2;
-  TensorCheckEqual(A1, A2);
-
-  // a *= p1 * b + p2 * c
-  A1.dotMulSum(B, C, p1, p2);
-  A2 *= B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  // a = p1 * a + p2 * b * c
-  A1.addDotMul(B, C, p1, p2);
-  A2 = A2 * p1 + B.constant(p2) * B * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
-  A2 = (B == (real)0.0).condition((real)0.0, B / C);
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
-  A2 = (B + p1) / (C + p2);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.5;
-  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
-  A2 = (B * p1 + C * p2 + p3).reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
-  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropyBp(Tensor& A1,
-                                  Tensor& A2,
-                                  Tensor& B,
-                                  Tensor& C) {
-  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
-  A2 += (B - C) / (B * (B.constant(1.0f) - B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorAdd(A1, A2, B, C);
-  testTensorSub(A1, A2, B, C);
-  testTensorMul(A1, A2, B, C);
-  testTensorDiv(A1, A2, B, C);
-  testTensorReciprocal(A1, A2, B, C);
-  testTensorSoftCrossEntropyBp(A1, A2, B, C);
-
-  testTensorSoftCrossEntropy(A1, A2, B, C);
-}
-
-TEST(Ternary, BaseOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropy(Tensor& A1,
-                                       Tensor& A2,
-                                       Tensor& B,
-                                       Tensor& C) {
-  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
-  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
-                                         Tensor& A2,
-                                         Tensor& B,
-                                         Tensor& C) {
-  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
-  A1.binaryLabelCrossEntropyBp(B, C);
-  A2 += (C > (real)0.5)
-            .condition((B.constant(-1.0f) / B),
-                       (B.constant(1.0f) - B).reciprocal());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLoss(Tensor& A1,
-                                      Tensor& A2,
-                                      Tensor& B,
-                                      Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * a = log(1 + exp(x)) - c * x
-   */
-  A1.logisticRegressionLoss(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLossBp(Tensor& A1,
-                                        Tensor& A2,
-                                        Tensor& B,
-                                        Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * x = exp(x); a = x / (1 + x) - c
-   */
-  A1.logisticRegressionLossBp(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  auto tmp2 = tmp.exp();
-  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
-  A2 = (B > C).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.max2(B, C);  // a = (b > c) ? b : c
-  A2 = (B > C).condition(B, C);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
-  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
-  testTensorBiggerThan(A1, A2, B, C);
-  testTensorMax(A1, A2, B, C);
-
-  testTensorLogisticRegressionLoss(A1, A2, B, C);
-  testTensorLogisticRegressionLossBp(A1, A2, B, C);
-}
-
-TEST(Ternary, CompareOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testQuaternaryAdd(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
-  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
-  // TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c + p3 * d;
-   * a += tmp * tmp
-   */
-  real p1 = 1.5f;
-  real p2 = 2.5f;
-  real p3 = 3.5f;
-  A1.addSquareSum(B, C, D, p1, p2, p3);
-  auto tmp = B * p1 + C * p2 + D * p3;
-  A2 += tmp * tmp;
-  TensorCheckEqual(A1, A2);
-}
-
-TEST(Quaternary, BaseOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-  A1.biggerThan(B, C, D);
-  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
-           .condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLoss(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = log(1 + exp(a)) - a * d
-   */
-  A1.rankLoss(B, C, D);
-
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLossBp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = exp(a); a = (a / (1 + a) - d)
-   */
-  A1.rankLossBp(B, C, D);
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  auto tmp3 = tmp2.exp();
-  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testQuaternaryCompareOp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  testTensorBiggerThan(A1, A2, B, C, D);
-  testTensorRankLoss(A1, A2, B, C, D);
-  testTensorRankLossBp(A1, A2, B, C, D);
-}
-
-TEST(Quaternary, CompareOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
-#endif
-}
diff --git a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
deleted file mode 100644
index 214ae8971..000000000
--- a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
+++ /dev/null
@@ -1,461 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "OriginalOptimizerApi.h"
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/legacy/math/TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-#ifndef PADDLE_TYPE_DOUBLE
-DEFINE_double(max_diff, 1e-5, "max diff allowed");
-#else
-DEFINE_double(max_diff, 1e-13, "max diff allowed");
-#endif
-
-class SetMaxDiff {
- public:
-  explicit SetMaxDiff(double max_diff) {
-    max_diff_ = FLAGS_max_diff;
-    FLAGS_max_diff = max_diff;
-  }
-  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
-
- private:
-  double max_diff_;
-};
-
-#define COPY_VECTOR_TO_CPU(cpuVec, vector)               \
-  do {                                                   \
-    if (vector->useGpu()) {                              \
-      cpuVec = Vector::create(vector->getSize(), false); \
-      cpuVec->copyFrom(*vector);                         \
-    } else {                                             \
-      cpuVec = vector;                                   \
-    }                                                    \
-  } while (0)
-
-int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (fabs(a - b) > FLAGS_max_diff) {
-      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
-        count++;
-      }
-    }
-  }
-
-  return count;
-}
-
-int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
-  VectorPtr tmp1;
-  VectorPtr tmp2;
-  COPY_VECTOR_TO_CPU(tmp1, vector1);
-  COPY_VECTOR_TO_CPU(tmp2, vector2);
-  return VectorCheckErr(*tmp1, *tmp2);
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define CHECK_VECTORPTR(vector1, vector2) \
-  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
-
-#else
-
-#define CHECK_VECTORPTR(vector1, vector2)
-
-#endif
-
-typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
-
-void testCase(testMatrixFunc matrixFunc) {
-#ifdef PADDLE_WITH_CUDA
-  for (auto useGpu : {false, true}) {
-#else
-  for (auto useGpu : {false}) {
-#endif
-    for (auto size : {1,
-                      32,
-                      64,
-                      128,
-                      512,
-                      1024,
-                      4096,
-                      32768,
-                      65536,
-                      131072,
-                      262144,
-                      524288,
-                      1048576,
-                      2097152}) {
-      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
-      matrixFunc(size, useGpu);
-    }
-  }
-}
-
-#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
-  vec1[type] = Vector::create(size, useGpu);        \
-  vec2[type] = Vector::create(size, useGpu);        \
-  vec1[type]->rand();                               \
-  vec2[type]->copyFrom(*vec1[type]);
-
-void testAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
-      bufs1, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adagradApply(value,
-                                      grad,
-                                      mom,
-                                      accum_buffer,
-                                      accum,
-                                      lr,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, Adagrad) { testCase(testAdagrad); }
-
-void testAdaDelta(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
-      bufs1, rou, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adadeltaApply(value,
-                                       grad,
-                                       mom,
-                                       accum,
-                                       accum_update,
-                                       lr,
-                                       rou,
-                                       epsilon,
-                                       learningRate,
-                                       momentum,
-                                       decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, AdaDelta) { testCase(testAdaDelta); }
-
-template <bool isFirstTime>
-void testRMSProp(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  /* make sure 'g - f.square()' greater than 0 */
-  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
-  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
-      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
-                                                   accumulatedRou,
-                                                   rou,
-                                                   epsilon,
-                                                   learningRate,
-                                                   momentum,
-                                                   decayRate,
-                                                   isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(rmspropApply(value,
-                                      grad,
-                                      mom,
-                                      sum,
-                                      sum1,
-                                      lr,
-                                      accumulatedRou,
-                                      rou,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate,
-                                      isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, RMSProp) {
-  testCase(testRMSProp<true>);
-  testCase(testRMSProp<false>);
-}
-
-template <bool isFirstTime>
-void testDecayedAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  if (isFirstTime) {
-    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-  }
-
-  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
-                                                          accumulatedRou,
-                                                          rou,
-                                                          epsilon,
-                                                          learningRate,
-                                                          momentum,
-                                                          decayRate,
-                                                          isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
-                                             grad,
-                                             mom,
-                                             sum,
-                                             lr,
-                                             accumulatedRou,
-                                             rou,
-                                             epsilon,
-                                             learningRate,
-                                             momentum,
-                                             decayRate,
-                                             isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, DecayedAdagrad) {
-  testCase(testDecayedAdagrad<false>);
-  testCase(testDecayedAdagrad<true>);
-}
-
-void testAdam(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
-      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
-
-  EXPRESSION_PERFORMANCE(adamApply(value,
-                                   grad,
-                                   mom,
-                                   v,
-                                   beta1,
-                                   beta2,
-                                   beta1_power,
-                                   beta2_power,
-                                   epsilon,
-                                   learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
-                  bufs2[PARAMETER_SECOND_MOMENTUM]);
-}
-
-TEST(Training, Adam) { testCase(testAdam); }
-
-void testAdamax(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
-  int64_t step = 2;
-
-  EXPRESSION_PERFORMANCE(
-      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  EXPRESSION_PERFORMANCE(
-      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
-                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
-}
-
-TEST(Training, Adamax) {
-#ifndef PADDLE_TYPE_DOUBLE
-  SetMaxDiff diff(1e-4);
-#endif
-  testCase(testAdamax);
-}
-
-void testSparseMomentum(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
-
-  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
-  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
-      bufs1, alpha, beta, gamma, tau, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
-  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
-
-  EXPRESSION_PERFORMANCE(sparseMomentumApply(
-      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
-}
-
-TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
diff --git a/paddle/legacy/math/tests/test_batchTranspose.cpp b/paddle/legacy/math/tests/test_batchTranspose.cpp
deleted file mode 100644
index ccfd6d5aa..000000000
--- a/paddle/legacy/math/tests/test_batchTranspose.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_batch_transpose.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
-  const int nx = 100;
-  const int ny = 50;
-  const int numSamples = 50;
-
-  MatrixPtr cMat = Matrix::create(numSamples, nx * ny, false, false);
-  MatrixPtr gMat = Matrix::create(numSamples, nx * ny, false, true);
-
-  MatrixPtr cBatchTransMat = Matrix::create(numSamples, nx * ny, false, false);
-  MatrixPtr gBatchTransMat = Matrix::create(numSamples, nx * ny, false, true);
-  MatrixPtr cMat_d2h = Matrix::create(numSamples, nx * ny, false, false);
-
-  real* cData = cMat->getData();
-  real* gold = cBatchTransMat->getData();
-
-  // host
-  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
-    for (int j = 0; j < ny; j++)
-      for (int i = 0; i < nx; i++)
-        cData[sample_id * nx * ny + j * nx + i] = j * nx + i;
-
-  // correct result for error checking
-  for (int sample_id = 0; sample_id < numSamples; ++sample_id)
-    for (int j = 0; j < ny; j++)
-      for (int i = 0; i < nx; i++)
-        gold[sample_id * nx * ny + i * ny + j] =
-            cData[sample_id * nx * ny + j * nx + i];
-  // device
-  gMat->copyFrom(*cMat, HPPL_STREAM_DEFAULT);
-  batchTranspose(
-      gMat->getData(), gBatchTransMat->getData(), nx, ny, numSamples);
-  cMat_d2h->copyFrom(*gBatchTransMat, HPPL_STREAM_DEFAULT);
-  checkMatrixEqual(cBatchTransMat, cMat_d2h);
-}
-#endif
diff --git a/paddle/legacy/math/tests/test_lazyAssign.cu b/paddle/legacy/math/tests/test_lazyAssign.cu
deleted file mode 100644
index cf8c3d771..000000000
--- a/paddle/legacy/math/tests/test_lazyAssign.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/TensorAssign.h"
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-typedef std::function<void(int height, int width)> testMatrixFunc;
-void testMatrixCase(testMatrixFunc matrixFunc) {
-  for (auto height : {1}) {
-    for (auto width : {1,
-                       32,
-                       64,
-                       128,
-                       512,
-                       1024,
-                       4096,
-                       32768,
-                       65536,
-                       131072,
-                       262144,
-                       524288,
-                       1048576,
-                       2097152,
-                       4194304,
-                       8388608}) {
-      matrixFunc(height, width);
-    }
-  }
-}
-
-template <typename Tensor>
-void testLazyAssign(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor B(height, width);
-  Tensor C(height, width);
-  Tensor D(height, width);
-  A1.randomizeUniform();
-  B.randomizeUniform();
-  C.randomizeUniform();
-  D.randomizeUniform();
-  A2.copyFrom(A1);
-
-  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
-
-  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
-                         auto expr2 = A2.lazyAssign(A2 * D);
-                         AssignEvaluate(expr1, expr2););
-
-  TensorCheckErr(A1, A2);
-}
-
-TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
-#endif
-
-template <typename Tensor>
-void sgdUpdateTensor(
-    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
-  C = C * p2 - D * (B + A * p3) * p1;
-  A += C;
-}
-
-void sgdUpdateLazyAssign(BaseMatrix& A,
-                         BaseMatrix& B,
-                         BaseMatrix& C,
-                         BaseMatrix& D,
-                         real p1,
-                         real p2,
-                         real p3) {
-  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
-  auto expr2 = A.lazyAssign(A + C);
-  AssignEvaluate(expr1, expr2);
-}
-
-template <typename Tensor>
-void testSgdUpdate(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor A3(height, width);
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  A3.copyFrom(A1);
-
-  Tensor B(height, width);
-  B.randomizeUniform();
-
-  Tensor C1(height, width);
-  Tensor C2(height, width);
-  Tensor C3(height, width);
-  C1.randomizeUniform();
-  C2.copyFrom(C1);
-  C3.copyFrom(C1);
-
-  Tensor D(height, width);
-  D.randomizeUniform();
-
-  real p1 = 0.2;
-  real p2 = 0.3;
-  real p3 = 0.5;
-
-  /**
-   * c = p2 * c - p1 * (b + p3 * a);
-   * a = a + c;
-   */
-  // BaseMatrix API
-  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
-
-  // Tensor expression
-  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
-
-  // lazyAssign
-  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
-
-  TensorCheckErr(A1, A2);
-  TensorCheckErr(A1, A3);
-  TensorCheckErr(C1, C2);
-  TensorCheckErr(C1, C3);
-}
-
-TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
-#endif
diff --git a/paddle/legacy/math/tests/test_matrixCompare.cpp b/paddle/legacy/math/tests/test_matrixCompare.cpp
deleted file mode 100644
index a43adde46..000000000
--- a/paddle/legacy/math/tests/test_matrixCompare.cpp
+++ /dev/null
@@ -1,1698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-#include "paddle/legacy/utils/DynamicLoader.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-void testMatrixMaxSequence(int batchSize, int inputDim) {
-  // forward
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  IVectorPtr cpuIndex = nullptr;
-  IVectorPtr gpuIndex = nullptr;
-  IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false);
-  IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true);
-  cpuIndex->zeroMem();
-  gpuIndex->zeroMem();
-
-  cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
-  gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-  TensorCheckEqual(*cpuIndex, *gpuIndex);
-
-  // backward
-  MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutputGrad = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
-  gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
-}
-
-TEST(Matrix, maxSequence) {
-  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
-    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testMatrixMaxSequence(batchSize, inputDim);
-    }
-  }
-}
-
-void testMatrixGetSum(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  int x = log10(height * width);
-  real err = 1e-6 * pow(10, x);
-#else
-  real err = 1e-8;
-#endif
-
-  real cpuSum = cpuInput->getSum();
-  real gpuSum = gpuInput->getSum();
-
-  EXPECT_LE(fabs(cpuSum - gpuSum), err);
-}
-
-void testMatrixGetMinMax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  real cpuMin = cpuInput->getMin();
-  real gpuMin = gpuInput->getMin();
-  real cpuMax = cpuInput->getMax();
-  real gpuMax = gpuInput->getMax();
-
-  EXPECT_EQ(cpuMin, gpuMin);
-  EXPECT_EQ(cpuMax, gpuMax);
-}
-
-void testMatrixZeroAtOffset(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuTest = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuTest->copyFrom(*cpuA);
-
-  int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
-  int numColumns = rand() % (width - columnOffset);  // NOLINT
-
-  if (numColumns == 0) return;
-
-  cpuA->zeroAtOffset(columnOffset, numColumns);
-  gpuA->zeroAtOffset(columnOffset, numColumns);
-
-  /* cpuTest */
-  real* a = cpuTest->getData() + columnOffset;
-  for (int64_t i = 0; i < height; ++i) {
-    for (int64_t j = 0; j < numColumns; ++j) {
-      a[i * width + j] = 0;
-    }
-  }
-
-  TensorCheckEqual(*cpuA, *gpuA);
-  TensorCheckEqual(*cpuA, *cpuTest);
-}
-
-void testMatrixDeepSwap(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuCopyA->copyFrom(*cpuA);
-  cpuCopyB->copyFrom(*cpuB);
-
-  // swap matrix cpuA and cpuB
-  cpuA->deepSwap(*cpuB);
-
-  TensorCheckEqual(*cpuA, *cpuCopyB);
-  TensorCheckEqual(*cpuB, *cpuCopyA);
-}
-
-void testMatrixTranspose(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuT = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuT = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-  cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, true);
-
-  TensorCheckEqual(*cpuT, *gpuT);
-}
-
-void testMatrixRotate(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-
-  cpu->rotate(cpuR, false, true);
-  gpu->rotate(gpuR, true, true);
-  TensorCheckEqual(*cpuR, *gpuR);
-
-  cpu->rotate(cpuR, true, false);
-  gpu->rotate(gpuR, false, false);
-  TensorCheckEqual(*cpuR, *gpuR);
-}
-
-void testMatrixInverse(int height) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
-  MatrixPtr cpuI = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpuI = std::make_shared<GpuMatrix>(height, height);
-
-  /* Make matrix well conditioned: cpu * cpuT + Identity */
-  cpu->randomizeUniform();
-  MatrixPtr cpuT = cpu->getTranspose();
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
-  outputCheck->mul(*cpu, *cpuT);
-  cpu->setDiag(1.0);
-  cpu->add(*outputCheck);
-
-  gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, true);
-  gpu->inverse(gpuI, false);
-
-  TensorCheckErr(*cpuI, *gpuI);
-
-  outputCheck->mul(*cpu, *cpuI);
-  cpu->setDiag(1.0);
-  TensorCheckErr(*cpu, *outputCheck);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixDeepSwap(height, width);
-      testMatrixZeroAtOffset(height, width);
-      testMatrixGetSum(height, width);
-      testMatrixTranspose(height, width);
-      testMatrixRotate(height, width);
-    }
-#ifdef LAPACK_FOUND
-    // inverse matrix
-    testMatrixInverse(height);
-#else
-    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
-                 << "support so we cannot test matrix inverse. To test "
-                 << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas, and re-build PaddlePaddle.";
-#endif
-  }
-}
-
-void testMatrixSoftmax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-void testSequenceSoftmax(int batchSize) {
-  // forward
-  int inputDim = 1;
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
-  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
-
-  TensorCheckErr(*cpuInput, *gpuInput);
-}
-
-void testMatrixSoftmaxThreshold(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  cpuInput->getData()[0] = 100.0;
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  // check output zero
-  int cpuCount = 0;
-  int gpuCount = 0;
-  auto zeroNum = [](MatrixPtr out, int& count) {
-    for (size_t i = 0; i < out->getHeight(); i++) {
-      for (size_t j = 0; j < out->getWidth(); j++) {
-        if (out->getElement(i, j) == 0) count++;
-      }
-    }
-  };
-  zeroNum(cpuOutput, cpuCount);
-  zeroNum(outputCheck, gpuCount);
-  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
-  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
-}
-
-void testMatrixSoftmaxBp(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuOutput->softmaxBackward(*gpuInput);
-
-  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
-  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
-  sftMaxSum->colMerge(*sftMaxDot);
-  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-TEST(Matrix, softmax) {
-  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 127
-    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixSoftmax(height, width);
-      testMatrixSoftmaxBp(height, width);
-      testMatrixSoftmaxThreshold(height, width);
-    }
-    testSequenceSoftmax(height);
-  }
-}
-
-void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
-  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
-  cpuTable->randomizeUniform();
-  gpuTable->copyFrom(*cpuTable);
-
-  IVectorPtr cpuIds;
-  IVectorPtr gpuIds;
-  cpuIds = VectorT<int>::create(numSamples, false);
-  gpuIds = VectorT<int>::create(numSamples, true);
-  cpuIds->rand(tableSize);
-  gpuIds->copyFrom(*cpuIds);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuOutput->addToRows(*cpuTable, *cpuIds);
-  gpuOutput->addToRows(*gpuTable, *gpuIds);
-
-  TensorCheckErr(*cpuTable, *gpuTable);
-}
-
-TEST(Matrix, tableProjection) {
-  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
-    for (auto tableSize : {10, 100}) {
-      for (auto inputDim : {20, 50}) {
-        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
-                << " inputDim=" << inputDim;
-        testMatrixAddToRows(numSamples, tableSize, inputDim);
-      }
-    }
-  }
-}
-
-void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  cpuC->mul(*cpuA, *cpuB, alpha, beta);
-  gpuC->mul(*gpuA, *gpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  auto subSize = [](int& start, int& end, int dim) {
-    if (dim == 1) {
-      start = 0;
-      end = dim;
-    } else {
-      int subDim = rand() % (dim - 1) + 1;  // NOLINT
-      start = rand() % (dim - subDim);      // NOLINT
-      end = start + subDim;
-    }
-  };
-
-  auto subMatrix = [](MatrixPtr& sub,
-                      MatrixPtr matrix,
-                      size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol) {
-    if (!matrix->isTransposed()) {
-      sub = matrix->subMatrix(startRow, endRow, startCol, endCol);
-    } else {
-      sub = matrix->subMatrix(startCol, endCol, startRow, endRow);
-    }
-  };
-
-  int startM, endM;
-  int startN, endN;
-  int startK, endK;
-  subSize(startM, endM, dimM);
-  subSize(startN, endN, dimN);
-  subSize(startK, endK, dimK);
-
-  MatrixPtr subCpuA;
-  MatrixPtr subCpuB;
-  MatrixPtr subGpuA;
-  MatrixPtr subGpuB;
-  subMatrix(subCpuA, cpuA, startM, endM, startK, endK);
-  subMatrix(subGpuA, gpuA, startM, endM, startK, endK);
-  subMatrix(subCpuB, cpuB, startK, endK, startN, endN);
-  subMatrix(subGpuB, gpuB, startK, endK, startN, endN);
-  MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
-  MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
-
-  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
-  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-TEST(Matrix, mul) {
-  for (auto transa : {false, true}) {
-    for (auto transb : {false, true}) {
-      for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) {
-        for (auto dimN : {1, 5, 37, 256, 1024}) {
-          for (auto dimK : {8, 45, 346, 784, 1025}) {
-            if (true == transa && true == transb) {
-              continue;
-            }
-            VLOG(3) << setiosflags(ios::left) << setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << setw(5) << dimM << " dimN=" << setw(5)
-                    << dimN << " dimK=" << setw(5) << dimK;
-
-            testMatrixMul(transa, transb, dimM, dimN, dimK);
-            testSubMatrixMul(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-void testVectorRowFunc(int size) {
-  CpuVectorPtr cpu = std::make_shared<CpuVectorT<real>>(size);
-  GpuVectorPtr gpu = std::make_shared<GpuVectorT<real>>(size);
-
-  cpu->rand();
-  gpu->copyFrom(*cpu);
-
-  EXPECT_EQ(cpu->getMax(), gpu->getMax());
-  EXPECT_EQ(cpu->getMin(), gpu->getMin());
-  EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax());
-}
-
-TEST(Vector, rowFunc) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorRowFunc(size);
-  }
-}
-
-template <class T>
-void testVectorReset(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpu->reset(value);
-  gpu->reset(value);
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVecortSelectFrom(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuDst = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuDst = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuSrc =
-      std::make_shared<CpuVectorT<T>>(size * 2);
-  std::shared_ptr<GpuVectorT<T>> gpuSrc =
-      std::make_shared<GpuVectorT<T>>(size * 2);
-  CpuIVectorPtr cpuIds = std::make_shared<CpuVectorT<int>>(size);
-  GpuIVectorPtr gpuIds = std::make_shared<GpuVectorT<int>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuSrc->rand();
-  } else {
-    cpuSrc->rand(100000);
-  }
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuIds->rand(size);
-  gpuIds->copyFrom(*cpuIds);
-
-  cpuDst->selectFrom(*cpuSrc, *cpuIds);
-  gpuDst->selectFrom(*gpuSrc, *gpuIds);
-
-  TensorCheckEqual(*cpuDst, *gpuDst);
-}
-
-template <class T>
-void testVecotrZeroMem(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  cpu->zeroMem();
-  gpu->zeroMem();
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVectorIsEqual(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuA = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuB = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuA = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuB = std::make_shared<GpuVectorT<T>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuB->rand();
-  } else {
-    cpuB->rand(100000);
-  }
-  gpuB->copyFrom(*cpuB);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpuA->isEqualTo(*cpuB, value);
-  gpuA->isEqualTo(*gpuB, value);
-
-  TensorCheckEqual(*cpuA, *gpuA);
-}
-
-TEST(Vector, Equal) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorReset<int>(size);
-    testVectorReset<real>(size);
-    testVecortSelectFrom<int>(size);
-    testVecortSelectFrom<real>(size);
-    testVecotrZeroMem<int>(size);
-    testVecotrZeroMem<real>(size);
-    testVectorIsEqual<int>(size);
-    testVectorIsEqual<real>(size);
-  }
-}
-
-void testMatrixTopK(int samples, int dim, int beamSize) {
-  MatrixPtr cpuSrc = std::make_shared<CpuMatrix>(samples, dim);
-  MatrixPtr gpuSrc = std::make_shared<GpuMatrix>(samples, dim);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-}
-
-TEST(Matrix, topK) {
-  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 127
-    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
-      for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
-        if (beamSize > dim) continue;
-        VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                << " dim=" << dim;
-        testMatrixTopK(samples, dim, beamSize);
-      }
-    }
-  }
-}
-
-void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
-  int nnz = samples * dim * ratio;
-  if (nnz < 1) nnz = 1;  // Because sparseRand in MathUtil.cpp requires this.
-  MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuVal->zero();
-  cpuIds->zero();
-  gpuVal->zero();
-  gpuIds->zero();
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-
-  IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
-  outCheckIds->copyFrom(*gpuIds);
-
-  const int* data1 = cpuIds->getData();
-  const int* data2 = outCheckIds->getData();
-  size_t size = cpuIds->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (data1[i] == -1 && data1[i] != data2[i]) {
-      EXPECT_EQ(data1[i], data2[i]);
-    }
-  }
-}
-
-TEST(SMatrix, topK) {
-  for (auto samples : {1, 3, 61}) {
-    for (auto dim : {1, 3, 61}) {
-      for (auto beamSize : {1, 3, 61}) {
-        for (auto ratio : {0.01, 0.001}) {
-          if (beamSize > dim) continue;
-          VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                  << " dim=" << dim << " ratio=" << ratio;
-          testSMatrixTopK(samples, dim, beamSize, ratio);
-        }
-      }
-    }
-  }
-}
-
-void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
-  gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-
-  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInGrad->randomizeUniform();
-  gpuInGrad->copyFrom(*cpuInGrad);
-
-  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
-  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuInGrad, *gpuInGrad);
-}
-
-TEST(Matrix, sequenceAvg) {
-  for (auto batchSize : {10, 128, 6000}) {
-    for (auto inputDim : {32, 100, 512}) {
-      for (auto mode : {0, 1, 2}) {
-        VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
-                << " mode=" << mode;
-        testMatrixSequenceAvg(batchSize, inputDim, mode);
-      }
-    }
-  }
-}
-
-void testParamReluBackwardDiff(int height,
-                               int width,
-                               int w_height,
-                               int w_width) {
-  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr diff = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  oGrad->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  diff->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr diffGpu = CpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  oGradGpu->copyFrom(*oGrad);
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-  diffGpu->copyFrom(*diff);
-
-  diff->paramReluBackwardDiff(*oGrad, *input, *w);
-  diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
-
-  TensorCheckErr(*diff, *diffGpu);
-}
-
-TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 40, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          if (width % (w_height * w_width)) continue;
-          testParamReluBackwardDiff(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testClassificationError(int numSamples, int dim, int topkSize) {
-  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
-  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
-  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
-  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
-
-  cpuOutput->randomizeUniform();
-  cpuLabel->rand(dim);
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuLabel->copyFrom(*cpuLabel);
-
-  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
-  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
-
-  TensorCheckEqual(*cpuError, *gpuError);
-}
-
-TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 3, 31}) {
-    for (auto dim : {1, 3, 31}) {
-      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
-        if (topkSize > dim) continue;
-        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
-                << " dim= " << dim;
-        testClassificationError(numSamples, dim, topkSize);
-      }
-    }
-  }
-}
-
-void testMaxPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->maxPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPoolBackward(*input,
-                             imgSizeH,
-                             imgSizeW,
-                             *targetGrad,
-                             *target,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->maxPoolBackward(*inputGpu,
-                                imgSizeH,
-                                imgSizeW,
-                                *targetGpuGrad,
-                                *targetGpu,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->avgPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPoolBackward(*targetGrad,
-                             imgSizeH,
-                             imgSizeW,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->avgPoolBackward(*targetGpuGrad,
-                                imgSizeH,
-                                imgSizeW,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, PoolFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {1, 3}) {
-      for (auto imgSizeH : {13, 17}) {
-        for (auto imgSizeW : {17, 19}) {
-          for (auto sizeX : {2, 3}) {
-            for (auto sizeY : {2, 3}) {
-              for (auto sH : {1, 2}) {
-                for (auto sW : {1, 2}) {
-                  for (auto pH : {0, (sizeY - 1) / 2}) {
-                    for (auto pW : {0, (sizeX - 1) / 2}) {
-                      VLOG(3) << " numSamples=" << numSamples
-                              << " channels=" << channels
-                              << " imgSizeH=" << imgSizeH
-                              << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX
-                              << " sizeY=" << sizeY << " strideH=" << sH
-                              << " strideW=" << sW << " padingH=" << pH
-                              << " padingW=" << pW;
-                      testMaxPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                      testAvgPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void testMaxOutFwdBwd(
-    int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outChannels = channels / groups;
-  int outWidth = imgSizeH * imgSizeW * outChannels;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
-  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  target->maxoutForward(*input, *id, outChannels, groups);
-  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*target, *targetGpu);
-  TensorCheckEqual(*id, *idGpu);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
-  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-TEST(Matrix, MaxOutFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          for (auto groups : {2, 4}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " groups=" << groups;
-            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(CpuMatrix, copyFrom) {
-  const size_t height = 31;
-  const size_t width = 53;
-  CpuMatrix cpu(height, width);
-  GpuMatrix gpu(height, width);
-  CpuMatrix copy(height, width);
-
-  cpu.randomizeUniform();
-  gpu.copyFrom(cpu);
-  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
-
-  TensorCheckEqual(cpu, copy);
-}
-
-void testBatch2seqPadding(int batchSize, int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
-    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
-  }
-
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  size_t numSeq = cpuSequence->getSize() - 1;
-  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
-                                       cpuSequence->getData() + numSeq);
-
-  printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
-  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-
-  // hl_sequence2batch_copy_padding(gBatch->getData(),
-  //                                gpuInput->getData(),
-  //                                cpuSequence->getData(),
-  //                                inputDim,
-  //                                maxSeqLen,
-  //                                numSeq,
-  //                                false,
-  //                                true);
-  // cCheck->copyFrom(*gBatch);
-
-  // int* seqStart = cpuSequence->getData();
-  // float* batchData = cBatch->getData();
-  // float* seqData = cpuInput->getData();
-  // for (size_t i = 0; i < maxSeqLen; i++) {
-  //   for (size_t j = 0; j < numSeq; j++) {
-  //     size_t sequenceStart = seqStart[j];
-  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
-  //     if (i < sequenceLength) {
-  //       memcpy(batchData + (i * numSeq + j) * inputDim,
-  //              seqData + (sequenceStart + i) * inputDim,
-  //              inputDim * sizeof(real));
-  //     } else {
-  //       memset(batchData + (i * numSeq + j) * inputDim,
-  //              0,
-  //              inputDim * sizeof(real));
-  //     }
-  //   }
-  // }
-
-  // TensorCheckErr(*cBatch, *cCheck);
-}
-
-TEST(Matrix, warpCTC) {
-  for (auto batchSize : {1, 3, 17}) {
-    for (auto inputDim : {1, 3, 31}) {
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testBatch2seqPadding(batchSize, inputDim);
-    }
-  }
-}
-
-void testMaxPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPool3DForward(*input,
-                           *maxIdx,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-  targetGpu->maxPool3DForward(*inputGpu,
-                              *maxIdxGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPool3DBackward(*targetGrad,
-                               *maxIdx,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
-                                  *maxIdxGpu,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPool3DForward(*input,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-
-  targetGpu->avgPool3DForward(*inputGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPool3DBackward(*targetGrad,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-
-  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, Pool3DFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {3}) {
-      for (auto imgSizeD : {9, 16}) {
-        for (auto imgSizeH : {9, 32}) {
-          for (auto imgSizeW : {9, 32}) {
-            for (auto sizeX : {3}) {
-              for (auto sizeY : {3}) {
-                for (auto sizeZ : {3}) {
-                  for (auto sD : {2}) {
-                    for (auto sH : {2}) {
-                      for (auto sW : {2}) {
-                        for (auto pD : {0, (sizeZ - 1) / 2}) {
-                          for (auto pH : {0, (sizeY - 1) / 2}) {
-                            for (auto pW : {0, (sizeX - 1) / 2}) {
-                              VLOG(3) << " numSamples=" << numSamples
-                                      << " channels=" << channels
-                                      << " imgSizeD=" << imgSizeD
-                                      << " imgSizeH=" << imgSizeH
-                                      << " imgSizeW=" << imgSizeW
-                                      << " sizeX=" << sizeX
-                                      << " sizeY=" << sizeY
-                                      << " sizeZ=" << sizeZ << " strideD=" << sD
-                                      << " strideH=" << sH << " strideW=" << sW
-                                      << " padingD=" << pD << " padingH=" << pH
-                                      << " padingW=" << pW;
-
-                              testMaxPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                              testAvgPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  //  for (auto numSamples : {1, 3}) {
-  //    for (auto channels : {1, 3}) {
-  //      for (auto imgSizeD : {9,16}) {
-  //      for (auto imgSizeH : {9, 32}) {
-  //        for (auto imgSizeW : {9, 32}) {
-  //          for (auto sizeX : {2, 3}) {
-  //            for (auto sizeY : {2, 3}) {
-  //            for (auto sizeZ : {2,3}){
-  //              for (auto sD : {1, 2}) {
-  //              for (auto sH : {1, 2}) {
-  //                for (auto sW : {1, 2}) {
-  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
-  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
-  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
-  //                      VLOG(3) << " numSamples=" << numSamples
-  //                              << " channels=" << channels
-  //                              << " imgSizeD=" << imgSizeD
-  //                              << " imgSizeH=" << imgSizeH
-  //                              << " imgSizeW=" << imgSizeW
-  //                              << " sizeX=" << sizeX
-  //                              << " sizeY=" << sizeY
-  //                              << " sizeZ=" << sizeZ
-  //                              << " strideD=" << sD
-  //                              << " strideH=" << sH
-  //                              << " strideW=" << sW
-  //                              << " padingD=" << pD
-  //                              << " padingH=" << pH
-  //                              << " padingW=" << pW;
-  //
-  //                      testMaxPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                      testAvgPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                    }
-  //                  }
-  //                }
-  //              }
-  //            }
-  //            }
-  //          }
-  //        }
-  //      }
-  //      }
-  //    }
-  //    }
-  //  }
-  //  }
-}
-
-void testMatrixCol2Vol(int depth, int height, int width) {
-  int channel = 3;
-  int filterX = 3, filterY = 4, filterZ = 5;
-  int strideX = 2, strideY = 2, strideZ = 2;
-  int padX = 1, padY = 1, padZ = 1;
-
-  MatrixPtr cpuImage =
-      std::make_shared<CpuMatrix>(channel, depth * height * width);
-  MatrixPtr gpuImage =
-      std::make_shared<GpuMatrix>(channel, depth * height * width);
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-
-  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
-  int outH = outputSize(height, filterY, padY, strideY, true);
-  int outW = outputSize(width, filterX, padX, strideX, true);
-
-  int colBufHeight = channel * filterZ * filterY * filterX;
-  int colBufWidth = outD * outH * outW;
-  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
-  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
-  cpuColBuf->vol2Col(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  gpuColBuf->vol2Col(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
-
-  cpuColBuf->randomizeUniform();
-  gpuColBuf->copyFrom(*cpuColBuf);
-  cpuColBuf->col2Vol(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  gpuColBuf->col2Vol(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  TensorCheckErr(*cpuImage, *gpuImage);
-}
-
-TEST(Matrix, col2Vol) {
-  for (auto depth : {9, 16, 64}) {
-    for (auto height : {9, 11, 128}) {
-      for (auto width : {9, 32, 128}) {
-        VLOG(3) << "depth=" << depth << " height=" << height
-                << " width=" << width;
-        testMatrixCol2Vol(depth, height, width);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_matrixUtil.h b/paddle/legacy/math/tests/test_matrixUtil.h
deleted file mode 100644
index 58c93f746..000000000
--- a/paddle/legacy/math/tests/test_matrixUtil.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Util.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-namespace paddle {
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  for (size_t r = 0; r < a->getHeight(); ++r) {
-    for (size_t c = 0; c < a->getWidth(); ++c) {
-      ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
-    }
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-  ASSERT_EQ(a.getFormat(), b.getFormat());
-  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
-  for (size_t r = 0; r < a.getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
-                       const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  for (size_t r = 0; r < a->getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
-                        const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-
-  if (a.getFormat() == SPARSE_CSC) {
-    int* rows = a.getRows();
-    for (size_t i = 0; i < a.getWidth(); i++) {
-      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a.getCols();
-    for (size_t i = 0; i < a.getHeight(); i++) {
-      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
-                             const CpuMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-
-  if (a->getFormat() == SPARSE_CSC) {
-    int* rows = a->getRows();
-    for (size_t i = 0; i < a->getWidth(); i++) {
-      for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a->getCols();
-    for (size_t i = 0; i < a->getHeight(); i++) {
-      for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  int count = 0;
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            LOG(INFO) << "a=" << aVal << "\t"
-                      << "b=" << bVal;
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (std::abs(a - b) > err) {
-        if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkDataEqual(const real* a, const real* b, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_FLOAT_EQ(a[i], b[i]);
-  }
-}
-
-}  //  namespace paddle
diff --git a/paddle/legacy/math/tests/test_perturbation.cpp b/paddle/legacy/math/tests/test_perturbation.cpp
deleted file mode 100644
index 969400666..000000000
--- a/paddle/legacy/math/tests/test_perturbation.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <cuda_runtime.h>
-#include <gtest/gtest.h>
-#include <cmath>
-#include <vector>
-#include "hl_cuda.h"
-#include "hl_perturbation_util.cuh"
-
-using namespace std;  // NOLINT
-
-#define _USE_MATH_DEFINES
-
-const int NUM_IMAGES = 2;
-const int SAMPLING_RATE = 2;
-const int IMG_SIZE = 41;
-const int TGT_SIZE = 21;
-const int CHANNELS = 3;
-
-class PerturbationTest : public testing::Test {
- protected:
-  virtual void SetUp() { generateTestImages(gpuImages_); }
-
-  virtual void TearDown() {}
-
-  void allocateMem(real*& gpuAngle,
-                   real*& gpuScale,
-                   int*& gpuCenterR,
-                   int*& gpuCenterC) {
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate translation parameters for testing.
-  void generateTranslationParams(int*& gpuCenterR,
-                                 int*& gpuCenterC,
-                                 int imgSize) {
-    int cpuCenterR[NUM_IMAGES * SAMPLING_RATE];
-    int cpuCenterC[NUM_IMAGES * SAMPLING_RATE];
-    for (int i = 0; i < NUM_IMAGES * SAMPLING_RATE; ++i) {
-      cpuCenterR[i] = (imgSize - 1) / 2;
-      cpuCenterC[i] = (imgSize - 1) / 2 - 1;
-    }
-
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate rotation parameters for testing.
-  void generateRotationParams(real*& gpuAngle) {
-    real cpuAngle[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuAngle[i] = 90.0 * M_PI / 180.0;
-    }
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuAngle, cpuAngle, sizeof(real) * NUM_IMAGES);
-  }
-
-  void generateScaleParams(real*& gpuScale) {
-    real cpuScale[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuScale[i] = static_cast<real>(TGT_SIZE - 2) / TGT_SIZE;
-    }
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES);
-  }
-
-  // Generate the test images, only the center regions are set to 1.
-  // The other parts are set to 0.
-  void generateTestImages(real*& gpuImages) {
-    const int IMAGE_MEM_SIZE = NUM_IMAGES * IMG_SIZE * IMG_SIZE * CHANNELS;
-    real cpuImages[IMAGE_MEM_SIZE];
-    // Set the middle of each image to 1.
-    real* ptr = cpuImages;
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      for (int r = 0; r < IMG_SIZE; ++r) {
-        for (int c = 0; c < IMG_SIZE; ++c) {
-          for (int ch = 0; ch < CHANNELS; ++ch) {
-            if (r >= IMG_SIZE / 4 && r < IMG_SIZE - IMG_SIZE / 4 &&
-                c >= IMG_SIZE / 4 && c < IMG_SIZE - IMG_SIZE / 4) {
-              *ptr = 1.0;
-            } else {
-              *ptr = 0.0;
-            }
-            ++ptr;
-          }
-        }
-      }
-    }
-    gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE);
-    hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE);
-  }
-
-  real* gpuImages_;
-};
-
-// Random perturbation. Only to make sure the code does not break.
-TEST_F(PerturbationTest, random_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         true,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-}
-
-TEST_F(PerturbationTest, identity_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         false,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, translation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      if (p < TGT_SIZE * CHANNELS) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-TEST_F(PerturbationTest, rotation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateRotationParams(gpuAngle);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, scale_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateScaleParams(gpuScaleRatio);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      int c = (p / CHANNELS) % TGT_SIZE;
-      int r = (p / CHANNELS) / TGT_SIZE;
-      if (r == 0 || r == TGT_SIZE - 1 || c == 0 || c == TGT_SIZE - 1) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
deleted file mode 100644
index 492aa0a68..000000000
--- a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
-//  so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
-
-void testSpMatrixAddBias(int M, int N, real rate, real scale) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_1);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->addBias(*cpuB, scale);
-  gpuA->addBias(*gpuB, scale);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixAddDense(int M, int N, real rate) {  // add3
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->add3(cpuB);
-  gpuA->add3(gpuB);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixMul(int M, int N, int K, real rate) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
-  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
-
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
-  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  gpuC->copyFrom(*cpuC, stream);
-  hl_stream_synchronize(stream);
-
-  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
-  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuC, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixErr(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuC),
-                  std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixCollectBias(int M, int N, real rate) {
-  int nnz = M * N * rate;
-  LOG(INFO) << "nnz=" << nnz;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuB->collectBias(*cpuA, 1);
-  gpuB->collectBias(*gpuA, 1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, N);
-  outputCheck->copyFrom(*gpuB, stream);
-  hl_stream_synchronize(stream);
-  checkMatrixErr(*cpuB, *outputCheck);
-}
-
-TEST(SMatrix, sMatrixOp) {
-  for (auto height : {1, 11, 200}) {
-    for (auto width : {200, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      for (auto rate : {0.02, 0.1}) {
-        testSpMatrixAddDense(height, width, rate);
-        testSpMatrixAddBias(height, width, rate, 1.0);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixMul) {
-  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100, 2000, 20480}) {
-      for (auto K : {100, 512, 1024}) {
-        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
-        testSpMatrixMul(M, N, K, 0.05);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixCollectBias) {
-  for (auto height : {1, 128, 200}) {
-    for (auto width : {100, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testSpMatrixCollectBias(height, width, 0.1);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/legacy/optimizer/CMakeLists.txt b/paddle/legacy/optimizer/CMakeLists.txt
deleted file mode 100644
index 7c80faa48..000000000
--- a/paddle/legacy/optimizer/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-set(OPITMIZER_SRCS
-    adadelta_optimizer.cc
-    adagrad_optimizer.cc
-    adam_optimizer.cc
-    optimizer.cc
-    parameter_optimizer.cc
-    sgd_optimizer.cc
-  )
-
-add_library(paddle_optimizer ${OPITMIZER_SRCS})
-target_link_libraries(paddle_optimizer paddle_proto glog)
-
-if (WITH_TESTING)
-    add_unittest(serialization_test serialization_test.cc)
-    add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
-endif()
diff --git a/paddle/legacy/optimizer/adadelta_optimizer.cc b/paddle/legacy/optimizer/adadelta_optimizer.cc
deleted file mode 100644
index 1faeb0cd3..000000000
--- a/paddle/legacy/optimizer/adadelta_optimizer.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "adadelta_optimizer.h"
-#include <algorithm>
-#include <cmath>
-
-namespace paddle {
-namespace optimizer {
-
-void AdadeltaOptimizer::Update(const Tensor* gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  Tensor& param = *parameter_;
-  const Tensor& grad = *gradient;
-  Tensor& accum_g = *accum_gradient_;
-  Tensor& accum_d = *accum_delta_;
-  Tensor& update_d = *update_delta_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];
-
-    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
-                  std::sqrt(accum_g[i] + epsilon_) * grad[i];
-
-    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];
-
-    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
-  }
-}
-
-std::string AdadeltaOptimizer::SerializeState() {
-  AdadeltaOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
-  TensorToProto(*accum_delta_, state.mutable_accum_delta());
-  TensorToProto(*update_delta_, state.mutable_update_delta());
-  return state.SerializeAsString();
-}
-
-void AdadeltaOptimizer::DeserializeState(const std::string& str) {
-  AdadeltaOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.accum_gradient(), accum_gradient_);
-  ProtoToTensor(state.accum_delta(), accum_delta_);
-  ProtoToTensor(state.update_delta(), update_delta_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adadelta_optimizer.h b/paddle/legacy/optimizer/adadelta_optimizer.h
deleted file mode 100644
index 5beb62295..000000000
--- a/paddle/legacy/optimizer/adadelta_optimizer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdadeltaOptimizer : public ParameterOptimizer {
- public:
-  AdadeltaOptimizer(
-      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        accum_delta_(new Tensor(parameter->size())),
-        update_delta_(new Tensor(parameter->size())),
-        rho_(rho),
-        epsilon_(epsilon),
-        decay_(decay) {}
-
-  ~AdadeltaOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-    if (accum_delta_) delete accum_delta_;
-    if (update_delta_) delete update_delta_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *accum_gradient_;
-  Tensor *accum_delta_;
-  Tensor *update_delta_;
-  double rho_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adagrad_optimizer.cc b/paddle/legacy/optimizer/adagrad_optimizer.cc
deleted file mode 100644
index 5ac65dbd7..000000000
--- a/paddle/legacy/optimizer/adagrad_optimizer.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cmath>
-
-#include "adagrad_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-void AdagradOptimizer::Update(const Tensor* gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  Tensor& param = *parameter_;
-  Tensor& accum_g = *accum_gradient_;
-  const Tensor& grad = *gradient;
-  for (size_t i = 0; i < param.size(); ++i) {
-    accum_g[i] += grad[i] * grad[i];
-    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
-                learning_rate * decay_ * param[i];
-  }
-}
-std::string AdagradOptimizer::SerializeState() {
-  AdagradOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
-  return state.SerializeAsString();
-}
-
-void AdagradOptimizer::DeserializeState(const std::string& str) {
-  AdagradOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-
-  num_sample_passed_ = state.num_sample_passed();
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.accum_gradient(), accum_gradient_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adagrad_optimizer.h b/paddle/legacy/optimizer/adagrad_optimizer.h
deleted file mode 100644
index b6fc06739..000000000
--- a/paddle/legacy/optimizer/adagrad_optimizer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdagradOptimizer : public ParameterOptimizer {
- public:
-  AdagradOptimizer(Tensor *parameter,
-                   LrPolicy *lr,
-                   double epsilon,
-                   double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdagradOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *accum_gradient_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adam_optimizer.cc b/paddle/legacy/optimizer/adam_optimizer.cc
deleted file mode 100644
index 9a4ff5ecc..000000000
--- a/paddle/legacy/optimizer/adam_optimizer.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "adam_optimizer.h"
-#include <cmath>
-
-namespace paddle {
-namespace optimizer {
-
-void AdamOptimizer::Update(const Tensor *gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
-  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
-  learning_rate *= std::sqrt(coef2) / coef1;
-  Tensor &param = *parameter_;
-  const Tensor &grad = *gradient;
-  Tensor &m = *momentums_;
-  Tensor &v = *velocitys_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
-    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
-    param[i] -=
-        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
-  }
-}
-
-std::string AdamOptimizer::SerializeState() {
-  AdamOptimizerState state;
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-  state.set_num_sample_passed(num_sample_passed_);
-
-  TensorToProto(*parameter_, state.mutable_parameter());
-  TensorToProto(*momentums_, state.mutable_momentums());
-  TensorToProto(*velocitys_, state.mutable_velocitys());
-  return state.SerializeAsString();
-}
-
-void AdamOptimizer::DeserializeState(const std::string &str) {
-  AdamOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-
-  ProtoToTensor(state.parameter(), parameter_);
-  ProtoToTensor(state.momentums(), momentums_);
-  ProtoToTensor(state.velocitys(), velocitys_);
-}
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/adam_optimizer.h b/paddle/legacy/optimizer/adam_optimizer.h
deleted file mode 100644
index fce109600..000000000
--- a/paddle/legacy/optimizer/adam_optimizer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdamOptimizer : public ParameterOptimizer {
- public:
-  AdamOptimizer(Tensor *parameter,
-                LrPolicy *lr,
-                double beta_1,
-                double beta_2,
-                double epsilon,
-                double decay)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(new Tensor(parameter->size())),
-        velocitys_(new Tensor(parameter->size())),
-        beta_1_(beta_1),
-        beta_2_(beta_2),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdamOptimizer() {
-    if (momentums_) delete momentums_;
-    if (velocitys_) delete velocitys_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
- private:
-  Tensor *momentums_;
-  Tensor *velocitys_;
-  double beta_1_;
-  double beta_2_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/lr_policy.h b/paddle/legacy/optimizer/lr_policy.h
deleted file mode 100644
index d639c9f22..000000000
--- a/paddle/legacy/optimizer/lr_policy.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <algorithm>
-#include "OptimizerConfig.pb.h"
-
-namespace paddle {
-namespace optimizer {
-
-class LrPolicy {
- public:
-  virtual ~LrPolicy() {}
-  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-};
-
-// constant learning rate policy
-class ConstLr final : public LrPolicy {
- public:
-  ConstLr(double lr) : learning_rate_(lr){};
-  double LearningRate(const uint64_t num_sample_passed) {
-    return learning_rate_;
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-  }
-
- private:
-  double learning_rate_;
-};
-
-class LinearLr final : public LrPolicy {
- public:
-  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
-  double LearningRate(const uint64_t num_sample_passed) {
-    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
-                    lr_decay_b_);
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    state.set_lr_decay_a(lr_decay_a_);
-    state.set_lr_decay_b(lr_decay_b_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-    lr_decay_a_ = state.lr_decay_a();
-    lr_decay_b_ = state.lr_decay_b();
-  }
-
- private:
-  double learning_rate_;
-  double lr_decay_a_;
-  double lr_decay_b_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/optimizer.cc b/paddle/legacy/optimizer/optimizer.cc
deleted file mode 100644
index e583aebd7..000000000
--- a/paddle/legacy/optimizer/optimizer.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "optimizer.h"
-#include <glog/logging.h>
-#include <cstdlib>
-#include <cstring>
-#include <string>
-
-#include "parameter_optimizer.h"
-
-using paddle::optimizer::ParameterOptimizer;
-using paddle::optimizer::Tensor;
-
-template <paddle_element_type VALUE>
-struct EnumToType {};
-
-template <class T>
-struct TypeToEnum {};
-
-#define MATCH_ENUM_TYPE(TYPE, ENUM)                 \
-  template <>                                       \
-  struct TypeToEnum<TYPE> {                         \
-    static paddle_element_type v() { return ENUM; } \
-    static constexpr TYPE value = ENUM;             \
-  };                                                \
-  template <>                                       \
-  struct EnumToType<ENUM> {                         \
-    typedef TYPE Type;                              \
-  }
-
-MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
-MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
-MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
-MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
-MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
-MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
-
-struct paddle_optimizer {
-  paddle::optimizer::ParameterOptimizer* impl;
-};
-
-paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
-                                          const int config_proto_len,
-                                          const paddle_element_type data_type,
-                                          void* param_buffer,
-                                          int num_bytes,
-                                          const char* state,
-                                          const int state_len) {
-  paddle_optimizer* optimizer = new paddle_optimizer;
-  std::string config(config_proto, config_proto + config_proto_len);
-  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
-                                 num_bytes / sizeof(float));
-  optimizer->impl = ParameterOptimizer::Create(config, parameter);
-  if (state != nullptr) {
-    std::string s(state, state + state_len);
-    optimizer->impl->DeserializeState(s);
-  }
-  return optimizer;
-}
-
-int paddle_release_optimizer(paddle_optimizer* o) {
-  if (o != nullptr) delete o->impl;
-  return PADDLE_SUCCESS;
-}
-
-int paddle_update_parameter(paddle_optimizer* o,
-                            const paddle_element_type data_type,
-                            const void* grad_buffer,
-                            int num_bytes) {
-  // TOOD(zhihong): datatype not work. need to add the runtime datatype
-  auto grad_type = reinterpret_cast<const float*>(grad_buffer);
-  Tensor* gradient =
-      new Tensor(const_cast<float*>(grad_type), num_bytes / sizeof(float));
-  o->impl->Update(gradient);
-  return PADDLE_SUCCESS;
-}
-
-int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
-  int param_size = 0;
-  *param_buffer = (void*)o->impl->get_weight(&param_size);
-  return param_size;
-}
-
-int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
-  std::string s = o->impl->SerializeState();
-  int state_len = s.size();
-
-  if (state_len > 0) {
-    *state = (char*)std::malloc(state_len);
-    std::memcpy((void*)*state, (const void*)s.c_str(), state_len);
-  }
-
-  return state_len;
-}
diff --git a/paddle/legacy/optimizer/optimizer.h b/paddle/legacy/optimizer/optimizer.h
deleted file mode 100644
index c079de921..000000000
--- a/paddle/legacy/optimizer/optimizer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdbool.h>
-#include <stdint.h>
-
-/**
- * @brief optimizer library in independent with other module
- * which will be used in :
- * Case A, the gradient optimized locally on the trainer.
- *
- * Case B, the gradient optimized on the parameter server.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32 = 0,
-  PADDLE_ELEMENT_TYPE_UINT32 = 1,
-  PADDLE_ELEMENT_TYPE_INT64 = 2,
-  PADDLE_ELEMENT_TYPE_UINT64 = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-/**
- * @brief execution status code
- */
-const int32_t PADDLE_SUCCESS = 0;
-const int32_t PADDLE_ERROR = -1;
-
-typedef struct paddle_optimizer paddle_optimizer;
-/**
- * this group interface called in order :
- * 1. create optimizer with config
- * 2. set weights
- * 3. update_parameter
- * 4. get_weights
- * 5. release optimizer
- */
-
-/**
- *  @brief create optimizer with proto_config
- *  @param config_proto, optimizer protobuf, see OptimizerConfig.proto in detail
- *  @return return optimizer instance
- */
-paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
-                                          const int config_proto_len,
-                                          const paddle_element_type data_type,
-                                          void* param_buffer,
-                                          int num_bytes,
-                                          const char* state,
-                                          const int state_len);
-
-/**
- *  @brief release optimizer
- *  @param optimizer
- *  @return return exec status
- */
-int paddle_release_optimizer(paddle_optimizer* o);
-
-/**
- *  @brief optimizer instance
- *  @param datatype of gradient and parameter
- *  @param gradient, calculate by optimzizer caller.
- *       TODO(zhihong): just pass loss to reduce communicate overhead.
- *                     Project Adam Ms'14 paper for detail
- *  @param num_bytes, gradient size
- *  @return return exec status
- */
-int paddle_update_parameter(paddle_optimizer* o,
-                            const paddle_element_type data_type,
-                            const void* gradient,
-                            int num_bytes);
-
-/**
- *  @brief optimizer for get parameter buffer
- *  @param param_buffer, initilized parameter buffer
- *  @return return content length
- */
-int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer);
-
-/**
- *  @brief optimzizer for saving training state
- *  @param training state for receive SerializeState
- *  @return return state_buffer length
- */
-int paddle_optimizer_get_state(paddle_optimizer* o, const char** state);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/paddle/legacy/optimizer/parameter_optimizer.cc b/paddle/legacy/optimizer/parameter_optimizer.cc
deleted file mode 100644
index f9474b315..000000000
--- a/paddle/legacy/optimizer/parameter_optimizer.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "adadelta_optimizer.h"
-#include "adagrad_optimizer.h"
-#include "adam_optimizer.h"
-#include "lr_policy.h"
-#include "sgd_optimizer.h"
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
-                                               Tensor *parameter) {
-  paddle::OptimizerConfig config;
-  CHECK(config.ParseFromString(config_proto) == true)
-      << "failed parse optimizer config";
-  auto select_lr_policy = [=](const OptimizerConfig &config) -> LrPolicy * {
-    if (config.lr_policy() == OptimizerConfig::Const)
-      return new ConstLr(config.const_lr().learning_rate());
-    if (config.lr_policy() == OptimizerConfig::Linear)
-      return new LinearLr(config.linear_lr().learning_rate(),
-                          config.linear_lr().lr_decay_a(),
-                          config.linear_lr().lr_decay_b());
-    // default
-    LOG(WARNING) << " have not select any LrPolicy. use ConstLr in default";
-    return new ConstLr(0.1);
-  };
-
-  LrPolicy *lr = select_lr_policy(config);
-  auto select_optimizer = [=](
-      Tensor *parameter,
-      const OptimizerConfig &config) -> ParameterOptimizer * {
-    if (config.optimizer() == OptimizerConfig::SGD) {
-      LOG(INFO) << "creating SGD optimizer";
-      return new SGDOptimizer(parameter,
-                              lr,
-                              config.sgd().momentum(),
-                              config.sgd().decay(),
-                              config.sgd().nesterov());
-    }
-    if (config.optimizer() == OptimizerConfig::Adadelta) {
-      LOG(INFO) << "creating Adadelta optimizer";
-      return new AdadeltaOptimizer(parameter,
-                                   lr,
-                                   config.adadelta().rho(),
-                                   config.adadelta().epsilon(),
-                                   config.adadelta().decay());
-    }
-    if (config.optimizer() == OptimizerConfig::Adagrad) {
-      LOG(INFO) << "creating Adagrad optimizer";
-      return new AdagradOptimizer(
-          parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
-    }
-    if (config.optimizer() == OptimizerConfig::Adam) {
-      LOG(INFO) << "creating Adam optimizer";
-      return new AdamOptimizer(parameter,
-                               lr,
-                               config.adam().beta_1(),
-                               config.adam().beta_2(),
-                               config.adam().epsilon(),
-                               config.adam().decay());
-    }
-    // default
-    LOG(WARNING)
-        << "have not select any Optimizer. use SGDOptimizer in default";
-    return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
-  };
-  return select_optimizer(parameter, config);
-}
-
-float *ParameterOptimizer::get_weight(int *param_size) const {
-  *param_size = (int)parameter_->size();
-  return parameter_->get_buffer();
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/parameter_optimizer.h b/paddle/legacy/optimizer/parameter_optimizer.h
deleted file mode 100644
index d5abca82d..000000000
--- a/paddle/legacy/optimizer/parameter_optimizer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <functional>
-#include <string>
-#include "OptimizerConfig.pb.h"
-#include "lr_policy.h"
-#include "serialization.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-class ParameterOptimizer {
- public:
-  /**
-   * @brief  update hook for algorithm need to traverse parameter more than
-   * once.
-   */
-  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
-      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
-  virtual ~ParameterOptimizer() {
-    delete parameter_;
-    delete lr_policy_;
-  }
-
-  static ParameterOptimizer *Create(const std::string &config_proto,
-                                    Tensor *parameter);
-  virtual void Update(const Tensor *gradient) = 0;
-  virtual float *get_weight(int *param_size) const;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-
- protected:
-  Tensor *parameter_;
-  // learning rate policy
-  LrPolicy *lr_policy_;
-  uint64_t num_sample_passed_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/parameter_optimizer_test.cc b/paddle/legacy/optimizer/parameter_optimizer_test.cc
deleted file mode 100644
index 1d9572999..000000000
--- a/paddle/legacy/optimizer/parameter_optimizer_test.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "parameter_optimizer.h"
-#include <cmath>
-#include <map>
-#include <vector>
-#include "gtest/gtest.h"
-#include "lr_policy.h"
-
-paddle::optimizer::Tensor* FillTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = (float)rand() / (float)RAND_MAX;
-  }
-  return param;
-}
-
-paddle::optimizer::Tensor* FixedTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = i;
-  }
-  return param;
-}
-
-class OptimizerTest : public testing::Test {
- public:
-  virtual ~OptimizerTest() {}
-  // init paddle::optimizer::Tensor shape
-  const size_t kSize = 5;
-
-  virtual void SetUp() {
-    CreateSGD();
-    CreateAdam();
-  }
-  virtual void TearDown() {}
-
-  void CreateSGD() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::SGD);
-    config_.mutable_sgd()->set_momentum(0.0);
-    config_.mutable_sgd()->set_decay(0.0);
-    config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void CreateAdam() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::Adam);
-    config_.mutable_adam()->set_beta_1(0.9);
-    config_.mutable_adam()->set_beta_2(0.1);
-    config_.mutable_adam()->set_epsilon(1e-3);
-    config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void TestGetWeight() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
-  void TestUpdate() {
-    paddle::optimizer::Tensor* g = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      opts_[i]->Update(g);
-    }
-  }
-
-  void TestCheckPoint() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      auto state = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      auto state1 = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      EXPECT_EQ(state, state1);
-
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
- private:
-  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
-  paddle::OptimizerConfig config_;
-};
-
-TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
-
-TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
-
-TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
diff --git a/paddle/legacy/optimizer/serialization.h b/paddle/legacy/optimizer/serialization.h
deleted file mode 100644
index 2067a8d8c..000000000
--- a/paddle/legacy/optimizer/serialization.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <type_traits>
-#include "OptimizerConfig.pb.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
-  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
-  std::stringstream os;
-  for (size_t i = 0; i < tensor.size(); ++i) {
-    os << tensor[i];
-    proto->add_content(os.str());
-    os.str(std::string());
-  }
-}
-
-static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
-  std::stringstream sin;
-  for (auto i = 0; i < proto.content_size(); ++i) {
-    sin << proto.content(i);
-    sin >> (*tensor)[i];
-    sin.str(std::string());
-    sin.clear();
-  }
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/serialization_test.cc b/paddle/legacy/optimizer/serialization_test.cc
deleted file mode 100644
index 93ee1f492..000000000
--- a/paddle/legacy/optimizer/serialization_test.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "serialization.h"
-#include "gtest/gtest.h"
-
-TEST(TensorToProto, Case1) {
-  paddle::optimizer::Tensor t(3), t1(3);
-  for (size_t i = 0; i < t.size(); ++i) {
-    t[i] = i;
-    t1[i] = 10;
-  }
-
-  paddle::TensorProto proto;
-  paddle::optimizer::TensorToProto(t, &proto);
-  paddle::optimizer::ProtoToTensor(proto, &t1);
-  for (size_t i = 0; i < t1.size(); ++i) {
-    EXPECT_EQ(t1[i], t[i]);
-  }
-}
-
-TEST(TensorToProto, Case2) {
-  paddle::optimizer::Tensor t(1), t1(1);
-  for (size_t i = 0; i < t.size(); ++i) {
-    t[i] = i;
-    t1[i] = 10;
-  }
-
-  paddle::TensorProto proto;
-  paddle::optimizer::TensorToProto(t, &proto);
-  paddle::optimizer::ProtoToTensor(proto, &t1);
-  for (size_t i = 0; i < t1.size(); ++i) {
-    EXPECT_EQ(t1[i], t[i]);
-  }
-}
diff --git a/paddle/legacy/optimizer/sgd_optimizer.cc b/paddle/legacy/optimizer/sgd_optimizer.cc
deleted file mode 100644
index c1e2064de..000000000
--- a/paddle/legacy/optimizer/sgd_optimizer.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "sgd_optimizer.h"
-#include "serialization.h"
-
-namespace paddle {
-namespace optimizer {
-
-void SGDOptimizer::Update(const Tensor *gradient) {
-  num_sample_passed_ += 1;
-  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
-  float velocity = 0.0;
-  Tensor &param = *parameter_;
-  const Tensor &grad = *gradient;
-  Tensor &m = *momentums_;
-  for (size_t i = 0; i < param.size(); ++i) {
-    if (momentum_ == 0.0) {
-      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
-    } else {
-      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
-             learning_rate * decay_ * param[i];
-      velocity = m[i];
-    }
-    if (nesterov_) {
-      param[i] += momentum_ * velocity - learning_rate * grad[i];
-    } else {
-      param[i] += velocity;
-    }
-  }
-}
-
-std::string SGDOptimizer::SerializeState() {
-  SGDOptimizerState state;
-  state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState();
-  state.mutable_lr_state()->ParseFromString(lr_str);
-  TensorToProto(*parameter_, state.mutable_parameter());
-  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
-  return state.SerializeAsString();
-}
-
-void SGDOptimizer::DeserializeState(const std::string &str) {
-  SGDOptimizerState state;
-  state.ParseFromString(str);
-  auto lr_state = state.lr_state();
-  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
-  num_sample_passed_ = state.num_sample_passed();
-  ProtoToTensor(state.parameter(), parameter_);
-  if (momentum_ != 0.0) ProtoToTensor(state.momentums(), momentums_);
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/sgd_optimizer.h b/paddle/legacy/optimizer/sgd_optimizer.h
deleted file mode 100644
index a8957cde5..000000000
--- a/paddle/legacy/optimizer/sgd_optimizer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class SGDOptimizer : public ParameterOptimizer {
- public:
-  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(nullptr),
-        momentum_(m),
-        decay_(d),
-        nesterov_(n) {
-    if (momentum_ != 0.0) {
-      size_t size = parameter->size();
-      momentums_ = new Tensor(size);
-    }
-  }
-  virtual ~SGDOptimizer() {
-    if (momentums_) delete momentums_;
-  }
-  void Update(const Tensor* gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string& state);
-
- private:
-  Tensor* momentums_;
-  double momentum_;
-  double decay_;
-  bool nesterov_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/optimizer/tensor.h b/paddle/legacy/optimizer/tensor.h
deleted file mode 100644
index 2e58577d4..000000000
--- a/paddle/legacy/optimizer/tensor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-/**
- * @brief tensor used by optimizer
- */
-
-#include <string.h>
-#include <memory>
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-namespace optimizer {
-
-template <class T>
-class TensorT {
- public:
-  TensorT(size_t size) : height_(1), width_(size) {
-    // new T[size]() initializes all element to zero value.
-    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
-    data_ = data_ptr_.get();
-  }
-
-  TensorT(T* data, size_t size)
-      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
-
-  TensorT(T* data, size_t h, size_t w)
-      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
-
-  virtual ~TensorT() {}
-
-  T* get_buffer() { return this->data_; }
-
-  T& operator[](const size_t idx) {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  T& operator[](const size_t idx) const {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  // TODO: replace with tensorshape
-  size_t size() const { return this->width_ * this->height_; }
-
- protected:
-  size_t height_;
-  size_t width_;
-  std::shared_ptr<T> data_ptr_;
-  T* data_;
-};
-
-// TODO(zhihong): design problem of dynamic datatype, need to fix it
-typedef TensorT<float> Tensor;
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.cpp b/paddle/legacy/parameter/Argument.cpp
deleted file mode 100644
index 3f1d599e9..000000000
--- a/paddle/legacy/parameter/Argument.cpp
+++ /dev/null
@@ -1,707 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Argument.h"
-#include "paddle/legacy/math/SparseMatrix.h"
-
-#include <algorithm>
-
-namespace paddle {
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(src->getHeight(), src->getWidth());
-    }
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    IVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          int32_t startRow,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startRow + copySize, src->getHeight());
-    int height = copySize;
-    int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(height, width);
-    }
-    MatrixPtr submat = src->subMatrix(startRow, copySize);
-    if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
-      // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
-      // First copy it to CPU, and then copy it to the GPU.
-      MatrixPtr tmp = src->clone(height, width, false);
-      tmp->copyFrom(*submat, stream);
-      dest->copyFrom(*tmp, stream);
-    } else {
-      dest->copyFrom(*submat, stream);
-    }
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    int height = copySize;
-    IVector::resizeOrCreate(dest, height, useGpu);
-    dest->copyFrom(src->getData() + startPos, height, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu);
-    dest->copyFrom(*src, startPos, copySize, useGpu, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    size_t height = src->size();
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin(), height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->size());
-    size_t height = copySize;
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin() + startPos, height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
-  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src,
-                                 bool useGpu,
-                                 hl_stream_t stream) {
-  dataId = src.dataId;
-  resizeAndCopy(value, src.value, useGpu, stream);
-  resizeAndCopy(grad, src.grad, useGpu, stream);
-  resizeAndCopy(in, src.in, useGpu, stream);
-  resizeAndCopy(ids, src.ids, useGpu, stream);
-  resizeAndCopy(sequenceStartPositions,
-                src.sequenceStartPositions,
-                false /* useGpu */,
-                stream);
-  if (src.hasSubseq()) {
-    resizeAndCopy(subSequenceStartPositions,
-                  src.subSequenceStartPositions,
-                  false /* useGpu */,
-                  stream);
-  }
-  resizeAndCopy(strs, src.strs, useGpu, stream);
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu) {
-  int32_t size =
-      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return size;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu,
-                                    hl_stream_t stream) {
-  dataId = src.dataId;
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-
-  if (!src.sequenceStartPositions) {
-    // non-sequence input, copy samples directly
-    int32_t startRow = startSeq;
-    resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copySize;
-  } else {
-    // sequence input
-    const int* sequence = src.sequenceStartPositions->getData(false);
-    int32_t startRow = sequence[startSeq];           // sample start from here
-    int32_t endRow = sequence[startSeq + copySize];  // sample end
-    int32_t copyFeatureSize = endRow - startRow;     // num of samples
-    resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(sequenceStartPositions,
-                  src.sequenceStartPositions,
-                  startSeq,
-                  copySize + 1,
-                  false,
-                  stream);
-    // modify new sequenceStartPositions
-    int* destSequences = sequenceStartPositions->getMutableData(false);
-    for (int i = 0; i < copySize + 1; i++) {
-      destSequences[i] -= startRow;
-    }
-    CHECK_EQ(destSequences[0], 0);
-    CHECK_EQ(destSequences[copySize], copyFeatureSize);
-    if (src.hasSubseq()) {
-      // sequence has sub-sequence
-      int* subSequence = src.subSequenceStartPositions->getMutableData(false);
-      int32_t subStartSeq = 0;
-      int32_t subEndSeq = 0;
-      int numSubSequences = src.getNumSubSequences();
-      for (int i = 0; i < numSubSequences + 1; i++) {
-        if (subSequence[i] == startRow) {
-          subStartSeq = i;
-        } else if (subSequence[i] == endRow) {
-          subEndSeq = i;
-          break;
-        }
-      }
-      int32_t copySubSize = subEndSeq - subStartSeq;
-      resizeAndCopy(subSequenceStartPositions,
-                    src.subSequenceStartPositions,
-                    subStartSeq,
-                    copySubSize + 1,
-                    false,
-                    stream);
-      // modify new subSequenceStartPositions
-      int* destSubSequences = subSequenceStartPositions->getMutableData(false);
-      for (int i = 0; i < copySubSize + 1; i++) {
-        destSubSequences[i] -= startRow;
-      }
-      CHECK_EQ(destSubSequences[0], 0);
-      CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize);
-    }
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copyFeatureSize;
-  }
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      const std::vector<int>& selectRows,
-                      const std::vector<int>& seqStartPos,
-                      const std::vector<int>& copySize,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  CHECK(!subSequenceStartPositions)
-      << "undefined behavior for subsequence positions";
-
-  size_t batchSize = 0;
-  for (size_t i = 0; i < copySize.size(); ++i)
-    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
-
-  auto copyArg = [batchSize, stream](MatrixPtr& dst,
-                                     MatrixPtr src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
-  };
-
-  auto copyIds = [batchSize, stream](IVectorPtr& dst,
-                                     const IVectorPtr& src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(desStartRow, size)
-        ->copyFrom(*src->subVec(srcStartRow, size), stream);
-  };
-
-  auto copyStrs = [batchSize](SVectorPtr& dst,
-                              const SVectorPtr& src,
-                              int desStartRow,
-                              int srcStartRow,
-                              int size,
-                              bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin() + srcStartRow,
-              src->begin() + srcStartRow + size,
-              dst->begin() + desStartRow);
-  };
-
-  dataId = args[0].dataId;
-  CHECK_NE(seqStartPos.size(), 0UL);
-  int desStartRow = 0;
-  for (size_t i = 0; i < copySize.size(); ++i) {
-    int startPos = seqStartPos[i];
-    int endPos = seqStartPos[i + 1];
-    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
-    for (int j = startPos; j < endPos; ++j) {
-      const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
-                                   << "the same dataId.";
-      const int srcStartRow = selectRows[j];
-      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
-      if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
-      }
-      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
-      desStartRow += copySize[i];
-    }
-  }
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, seqStartPos.size(), useGpu);
-  sequenceStartPositions->copyFrom(
-      seqStartPos.data(), seqStartPos.size(), useGpu);
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  int32_t batchSize = 0;
-  int64_t numSequences = 0;
-  int64_t numSubSequences = 0;
-  for (auto& arg : args) {
-    batchSize += arg.getBatchSize();
-    numSequences += arg.getNumSequences();
-    numSubSequences += arg.getNumSubSequences();
-  }
-
-  auto copyArg = [batchSize, stream](
-      MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight());
-    tmpMatrix->copyFrom(*src, stream);
-  };
-
-  auto copyIds = [batchSize, stream](
-      IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, src->getSize())->copyFrom(*src, stream);
-  };
-
-  auto copyStrs = [batchSize](
-      SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin(), src->end(), dst->begin() + startRow);
-  };
-
-  auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq,
-                            const ICpuGpuVectorPtr& srcSeq,
-                            int dstNumSequences,
-                            int srcNumSequences,
-                            int& startSequences,
-                            int startRow) {
-    if (srcSeq) {
-      ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false);
-      const int* src = srcSeq->getData(false);
-      int* dest = dstSeq->getMutableData(false);
-      for (int i = 0; i < srcNumSequences + 1; ++i) {
-        dest[i + startSequences] = src[i] + startRow;
-      }
-      startSequences += srcNumSequences;
-    } else {
-      dstSeq.reset();
-    }
-  };
-
-  int startRow = 0;
-  int startSequences = 0;
-  int startSubSequences = 0;
-  dataId = args[0].dataId;
-  for (auto& arg : args) {
-    CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                 << " same dataId";
-    copyArg(in, arg.in, startRow, useGpu);
-    copyArg(value, arg.value, startRow, useGpu);
-    if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu);
-    copyIds(ids, arg.ids, startRow, useGpu);
-    copySequencePos(sequenceStartPositions,
-                    arg.sequenceStartPositions,
-                    numSequences,
-                    arg.getNumSequences(),
-                    startSequences,
-                    startRow);
-    copySequencePos(subSequenceStartPositions,
-                    arg.subSequenceStartPositions,
-                    numSubSequences,
-                    arg.getNumSubSequences(),
-                    startSubSequences,
-                    startRow);
-    copyStrs(strs, arg.strs, startRow, useGpu);
-    startRow += arg.getBatchSize();
-  }
-}
-
-void Argument::splitByDataId(const std::vector<Argument>& argus,
-                             std::vector<std::vector<Argument>>* arguGroups) {
-  arguGroups->clear();
-  int lastDataId = -1;
-  for (const auto& argu : argus) {
-    if (argu.dataId == -1) {
-      // is -1, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = -1;
-    } else if (argu.dataId != lastDataId) {
-      // not -1, also not equal to last Argument, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = argu.dataId;
-    } else {
-      // not -1, and equal to last Argument, do nothing
-    }
-    arguGroups->back().push_back(argu);
-  }
-}
-
-void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
-  const int* starts = sequenceStartPositions->getData(false);
-  const int* subStarts =
-      hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr;
-  size_t numSequences = getNumSequences();
-  seqInfo->reserve(numSequences);
-  int subSeqEnd = 0;
-  for (size_t i = 0; i < numSequences; ++i) {
-    SeqInfo info;
-    info.seqStart = starts[i];
-    info.subLevelLength = starts[i + 1] - starts[i];
-    info.seqId = i;
-    if (hasSubseq()) {
-      info.subSeqStart = subSeqEnd;
-      while (subStarts[subSeqEnd] < starts[i + 1]) {
-        ++subSeqEnd;
-      }
-      info.topLevelLength = subSeqEnd - info.subSeqStart;
-    } else {
-      info.topLevelLength = info.subLevelLength;
-      info.subSeqStart = 0;  // not used
-    }
-    seqInfo->push_back(info);
-  }
-  std::sort(
-      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
-        return a.topLevelLength > b.topLevelLength;
-      });
-}
-
-void Argument::checkSubset() const {
-  if (getNumSequences() > getNumSubSequences()) {
-    LOG(FATAL) << "numSubSequences is less than numSequences ("
-               << getNumSubSequences() << " vs. " << getNumSequences() << ")";
-  }
-  const int* start = sequenceStartPositions->getData(false);
-  const int* subStart = subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  int subSeqId = 0;
-  while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) {
-    if (start[seqId] > subStart[subSeqId]) {
-      ++subSeqId;
-    } else if (start[seqId] == subStart[subSeqId]) {
-      ++subSeqId;
-      ++seqId;
-    } else {
-      LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-    }
-  }
-  if (seqId < getNumSequences()) {
-    LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-  }
-}
-
-void Argument::degradeSequence(const Argument& input) {
-  CHECK_EQ(input.hasSubseq(), 1UL);
-  size_t numSequences = input.getNumSequences();
-  size_t numSubSequences = input.getNumSubSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  const int* subStarts = input.subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) {
-    if (subStarts[subSeqId] == starts[seqId]) {
-      tgtBuf[seqId] = subSeqId;
-      seqId++;
-    }
-  }
-  tgtBuf[numSequences] = numSubSequences;
-}
-
-void Argument::poolSequenceWithStride(const Argument& input,
-                                      size_t stride,
-                                      ICpuGpuVectorPtr* stridePostions,
-                                      bool reversed) {
-  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
-  // then sequenceStartPositions = [0, 2, 3, 4, 7].
-  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
-  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
-
-  CHECK(input.sequenceStartPositions);
-  CHECK_EQ(input.hasSubseq(), 0UL);
-  CHECK_GT(stride, 0UL) << "stride must larger than 0";
-  size_t numSequences = input.getNumSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  // first index of target sequence and stride positions are both 0
-  tgtBuf[0] = 0;
-  std::vector<int> stridePos;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    size_t seqLength = starts[seqId + 1] - starts[seqId];
-    stridePos.emplace_back(starts[seqId]);
-    if (seqLength == 0) {
-      // empty sequence
-      tgtBuf[seqId + 1] = tgtBuf[seqId];
-    } else {
-      int size = ceil((float)seqLength / stride);
-      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
-      for (int i = 0; i < size - 1; ++i) {
-        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
-                           : stridePos.back() + stride;
-        stridePos.emplace_back(cur);
-      }
-    }
-  }
-  stridePos.emplace_back(starts[numSequences]);
-  int size = stridePos.size();
-  CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
-}
-
-void Argument::getValueString(
-    std::unordered_map<std::string, std::string>* out) const {
-  if (value) {
-    std::ostringstream os;
-    value->print(os);
-    out->insert({"value", os.str()});
-  }
-  if (ids) {
-    std::ostringstream os;
-    ids->print(os, ids->getSize());
-    out->insert({"ids", os.str()});
-  }
-  if (sequenceStartPositions) {
-    std::ostringstream os;
-    sequenceStartPositions->getVector(false)->print(
-        os, sequenceStartPositions->getSize());
-    out->insert({"sequence pos", os.str()});
-  }
-  if (subSequenceStartPositions) {
-    std::ostringstream os;
-    subSequenceStartPositions->getVector(false)->print(
-        os, subSequenceStartPositions->getSize());
-    out->insert({"sub-sequence pos", os.str()});
-  }
-}
-
-void Argument::printValueString(std::ostream& stream,
-                                const std::string& prefix) const {
-  std::unordered_map<std::string, std::string> out;
-  getValueString(&out);
-  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
-    auto it = out.find(field);
-    if (it != out.end()) {
-      stream << prefix << field << ":\n" << it->second;
-    }
-  }
-}
-
-void Argument::subArgFrom(const Argument& input,
-                          size_t offset,
-                          size_t height,
-                          size_t width,
-                          bool useGpu,
-                          bool trans,
-                          bool seqFlag,
-                          size_t seqStart,
-                          size_t seqSize) {
-  if (input.value) {
-    value = Matrix::create(
-        input.value->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (input.ids) {
-    ids = IVector::create(input.ids->getData() + offset, height, useGpu);
-  }
-  if (input.grad) {
-    grad = Matrix::create(
-        input.grad->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (seqFlag) {
-    sequenceStartPositions = std::make_shared<ICpuGpuVector>(
-        *(input.sequenceStartPositions), seqStart, seqSize);
-  }
-}
-
-void Argument::reorganizeSeqInfo(
-    const ICpuGpuVectorPtr seqStartPos,
-    const ICpuGpuVectorPtr subSeqStartPos,
-    std::vector<std::vector<int>>& reorganizedSeqInfo) {
-  CHECK(seqStartPos);
-  reorganizedSeqInfo.clear();
-
-  int seqNum = seqStartPos->getSize() - 1;
-  int* seqStarts = seqStartPos->getMutableData(false);
-
-  if (subSeqStartPos) {
-    int* subSeqStarts = subSeqStartPos->getMutableData(false);
-    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
-    int seqIdx = 0;
-    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
-      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
-        seqIdx++;
-        if (seqIdx == seqNum) return;
-        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      }
-    }
-  } else {
-    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
-    memcpy(reorganizedSeqInfo[0].data(),
-           seqStarts,
-           sizeof(int) * seqStartPos->getSize());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.h b/paddle/legacy/parameter/Argument.h
deleted file mode 100644
index ea8634896..000000000
--- a/paddle/legacy/parameter/Argument.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "hl_gpu.h"
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
-
-struct Argument {
-  Argument()
-      : in(nullptr),
-        value(nullptr),
-        ids(nullptr),
-        grad(nullptr),
-        strs(nullptr),
-        frameHeight(0),
-        frameWidth(0),
-        frameDepth(0),
-        sequenceStartPositions(nullptr),
-        subSequenceStartPositions(nullptr),
-        cpuSequenceDims(nullptr),
-        deviceId(-1),
-        allCount(0),
-        valueCount(0),
-        gradCount(0),
-        dataId(0) {}
-  Argument(const Argument& argument) {
-    *this = argument;
-    valueCount = 0;
-    gradCount = 0;
-    dataId = argument.dataId;
-  }
-  ~Argument() {}
-
-  void operator=(const Argument& argument) {
-    in = argument.in;
-    value = argument.value;
-    ids = argument.ids;
-    grad = argument.grad;
-    strs = argument.strs;
-    sequenceStartPositions = argument.sequenceStartPositions;
-    subSequenceStartPositions = argument.subSequenceStartPositions;
-    cpuSequenceDims = argument.cpuSequenceDims;
-    deviceId = argument.deviceId;
-    allCount = argument.allCount;
-    frameHeight = argument.frameHeight;
-    frameWidth = argument.frameWidth;
-    frameDepth = argument.frameDepth;
-    dataId = argument.dataId;
-  }
-
-  MatrixPtr in;  // used if needed
-  MatrixPtr value;
-  IVectorPtr ids;  // a sequence of ids. Can be use for class id for costLayer
-  MatrixPtr grad;  // If empty, gradient is not needed.
-  SVectorPtr strs;
-
-  // A dataBatch includes batchSize frames, one frame maybe not only vector
-  size_t frameHeight;
-  size_t frameWidth;
-  size_t frameDepth;
-
-  // If NULL, each position is treated independently.
-  // Otherwise, its size should be #NumberOfSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr sequenceStartPositions;
-
-  // If NULL, each sequence has no subsequence.
-  // Otherwise, its size should be #NumberOfSubSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr subSequenceStartPositions;
-
-  // dimension of sequence, stored only in CPU
-  IVectorPtr cpuSequenceDims;
-
-  int deviceId;            // the GPU device id which the argument in
-  int allCount;            // the number of output layers using this argument
-  mutable int valueCount;  // waiting this member when layer do forward
-  mutable int gradCount;   // waiting this member when layer do backward
-  mutable LockedCondition valueReadyCond;
-  mutable LockedCondition gradReadyCond;
-
-  int dataId;  // dataProvider id
-
-  /* Increase the reference count of the argument. */
-  void countIncrement() { allCount++; }
-
-  int getAllCount() const { return allCount; }
-
-  void waitValueReady() const {
-    valueReadyCond.wait([this] { return (valueCount != 0); });
-
-    std::lock_guard<std::mutex> guard(*valueReadyCond.mutex());
-    valueCount--;
-  }
-
-  void notifyValueReady() const {
-    valueReadyCond.notify_all([this] { valueCount = allCount; });
-  }
-
-  void waitGradReady() const {
-    gradReadyCond.wait([this] { return (gradCount == allCount); });
-    gradCount = 0;
-  }
-
-  void notifyGradReady() const {
-    gradReadyCond.notify_all([this] { gradCount++; });
-  }
-
-  int64_t getBatchSize() const {
-    if (value) return value->getHeight();
-    if (ids) return ids->getSize();
-    if (grad) return grad->getHeight();
-    if (in) return in->getHeight();
-    if (strs) return strs->size();
-    return 0;
-  }
-  size_t getFrameHeight() const { return frameHeight; }
-  size_t getFrameWidth() const { return frameWidth; }
-  size_t getFrameDepth() const { return frameDepth; }
-  void setFrameHeight(size_t h) { frameHeight = h; }
-  void setFrameWidth(size_t w) { frameWidth = w; }
-  void setFrameDepth(size_t d) { frameDepth = d; }
-
-  int64_t getNumSequences() const {
-    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
-                                  : getBatchSize();
-  }
-
-  int64_t getNumSubSequences() const {
-    return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1
-                                     : getBatchSize();
-  }
-
-  bool hasSeq() const { return sequenceStartPositions != nullptr; }
-  bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
-
-  const int* getCpuStartPositions() const {
-    return hasSubseq() ? subSequenceStartPositions->getData(false)
-                       : sequenceStartPositions->getData(false);
-  }
-
-  static inline real sum(const std::vector<Argument>& arguments) {
-    real cost = 0;
-    for (auto& arg : arguments) {
-      if (arg.value) {
-        SetDevice device(arg.deviceId);
-        cost += arg.value->getSum();
-      }
-    }
-    return cost;
-  }
-
-  /**
-   * @brief (value, ids, grad, sequenceStartPositions) of output are subset of
-   *        input. Note that, output share the same memory of input.
-   *
-   * @param input[in]       input
-   * @param offset[in]      offset in terms of rows
-   * @param height[in]      height of output.value
-   * @param width[in]       width of output.value
-   * @param useGpu[in]
-   * @param trans[in]       whether input.value is transform
-   * @param seqFlag[in]     whether input has sequenceStartPositions
-   * @param seqStart[in]    offset of input.sequenceStartPositions
-   * @param seqSize[in]     lenght of output.sequenceStartPositions
-   */
-  void subArgFrom(const Argument& input,
-                  size_t offset,
-                  size_t height,
-                  size_t width,
-                  bool useGpu,
-                  bool trans = false,
-                  bool seqFlag = false,
-                  size_t seqStart = 0,
-                  size_t seqSize = 0);
-  /*
-   * for sequence input:
-   *   startSeq: the sequence id of start
-   *   copySize: how many sequences need to copy
-   *   return value: how many samples are copied
-   * for non-sequence input:
-   *   startSeq: the sample id of start
-   *   copySize: how many samples need to copy
-   *   return value: how many samples are copied
-   * Note that when specifying the stream explicitly in this case,
-   * synchronize should also be called somewhere after this function
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu,
-                            hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu = FLAGS_use_gpu);
-
-  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
-
-  /*
-    @brief Concatenate several arguments into one and put the result into it.
-    @param args : a vector of argument, each element of which is a frame in a
-    batch of sequences.
-    @param selectRows : select several row of args to concatenate
-    @param seqStartPos : sequence start positions in the final Argument
-    @param hl_stream_t : cuda stream
-    @param passTyoe : type of task, training or testing
-   */
-  void concat(const std::vector<Argument>& args,
-              const std::vector<int>& selectRows,
-              const std::vector<int>& seqStartPos,
-              const std::vector<int>& copySize,
-              bool useGpu,
-              hl_stream_t stream,
-              PassType passType);
-
-  /*
-    Concatenate several args into one and put the result into this.
-   */
-  void concat(const std::vector<Argument>& src,
-              bool useGpu = FLAGS_use_gpu,
-              hl_stream_t stream = HPPL_STREAM_DEFAULT,
-              PassType passType = PASS_TEST);
-
-  /*
-   * split vector<Argument> to several vectors according to dataId
-   */
-  static void splitByDataId(const std::vector<Argument>& argus,
-                            std::vector<std::vector<Argument>>* arguGroups);
-
-  struct SeqInfo {
-    // Equal to sequence length for sequence data
-    // Equal to number of subsequences for subsequence data
-    int topLevelLength;
-
-    int seqStart;
-    int seqId;
-
-    // Equal to topLevelLength for sequence data
-    // Equal to sum of the length of subsequences for subsequence data
-    int subLevelLength;
-
-    // Only used for subsequence data, start position of this sequence
-    // is subSequenceStartPositions, i.e.
-    // subSequenceStartPositions[subSeqStart] == seqStart
-    int subSeqStart;
-  };
-  /*
-    Get SeqInfo for each sequence of this argument
-    Elements in *seqInfo are sorted by topLevelLength in descending order
-  */
-  void getSeqInfo(std::vector<SeqInfo>* segInfo) const;
-
-  /*
-   Check Whether sequenceStartPositions is subset of
-   subSequenceStartPositions.
-   */
-  void checkSubset() const;
-
-  /*
-   sequence has sub-sequence degrades to a sequence.
-   */
-  void degradeSequence(const Argument& input);
-
-  /*
-   After pooling with stride n (n is smaller than sequence length),
-   a long sequence will be shorten.
-   This function is invalid for sequence having sub-sequence.
-   */
-  void poolSequenceWithStride(const Argument& input,
-                              size_t stride,
-                              ICpuGpuVectorPtr* stridePositions,
-                              bool reversed = false);
-  /**
-   * @brief getValueString will return the argument's output in string. There
-   * are several kinds of output. The keys of output dictionary are 'value',
-   * 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param out [out]: the return values.
-   */
-  void getValueString(std::unordered_map<std::string, std::string>* out) const;
-
-  /**
-   * @brief printValueString will print the argument's output in order of
-   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param stream: Output stream
-   * @param prefix: line prefix for printing.
-   */
-  void printValueString(std::ostream& stream,
-                        const std::string& prefix = "") const;
-
-  /**
-   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
-   * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo.
-   *
-   * @param seqStartPos: sequenceStartPositions of an Argument.
-   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
-   * @param the reorganized sequence start position information.
-   *
-   * Examples:
-   * seqStartPos: [0, 4, 15, 20, 28]
-   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
-   * reorganizedSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   */
-  static void reorganizeSeqInfo(
-      const ICpuGpuVectorPtr seqStartPos,
-      const ICpuGpuVectorPtr subSeqStartPos,
-      std::vector<std::vector<int>>& reorganizedSeqInfo);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.cpp b/paddle/legacy/parameter/AverageOptimizer.cpp
deleted file mode 100644
index 82a7fed6c..000000000
--- a/paddle/legacy/parameter/AverageOptimizer.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageOptimizer.h"
-
-namespace paddle {
-
-// factory method to create an instance of AverageOptimizer
-ParameterOptimizer* AverageOptimizer::create(
-    const OptimizationConfig& optConfig,
-    ParameterOptimizer* optimizer,
-    bool isParameterSparse,
-    bool useParameterApply) {
-  if (optConfig.average_window() <= 0) {
-    return optimizer;
-  }
-  // disable average for embeded local updater
-  if (!useParameterApply && optConfig.num_batches_per_send_parameter() > 1) {
-    return optimizer;
-  }
-  if (isParameterSparse) {
-    return new AverageSparseOptimizer(optConfig, optimizer, useParameterApply);
-  }
-  return new AverageOptimizer(optConfig, optimizer, useParameterApply);
-}
-
-AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig,
-                                   ParameterOptimizer* optimizer,
-                                   bool useParameterApply)
-    : ParameterOptimizer(optConfig),
-      optimizer_(optimizer),
-      useApply_(useParameterApply),
-      numUpdates_(0),
-      prevNumUpdates_(0),
-      numAccumulates_(0),
-      oldNumAccumulates_(0),
-      minAverageWindow_(
-          std::min<int64_t>(10000L, optConfig_.max_average_window())),
-      maxAverageWindow_(optConfig_.max_average_window()) {
-  parameterTypes_ = optimizer_->getParameterTypes();
-  addParameterType(PARAMETER_SUM1);
-  addParameterType(PARAMETER_SUM2);
-  addParameterType(PARAMETER_SUM3);
-  if (useParameterApply) {
-    addParameterType(PARAMETER_APPLY);
-  }
-}
-
-void AverageOptimizer::startBatch(int64_t numSamplesProcessed) {
-  optimizer_->startBatch(numSamplesProcessed);
-  learningRate_ = optimizer_->getLearningRate();
-
-  ++numUpdates_;
-  ++numAccumulates_;
-}
-
-/*
-  After traversal, the averaged parameter can be obtained by
-  ((PARAMETER_SUM1 + PARAMETER_SUM2 + PARAMETER_SUM3)
-  / (numAccumulates_ + oldNumAccumulates_))
-*/
-ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->needSpecialTraversal(config)) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    callbacks.emplace_back([](const VectorPtr vecs[],
-                              const ParameterConfig& config,
-                              size_t sparseId) {
-      vecs[PARAMETER_SUM2]->add(*vecs[PARAMETER_SUM1]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-    });
-  }
-
-  if (isAverageWindowTooLong()) {
-    // Now the average window is too long, discard the old sum.
-    if (auto callback = this->startCatchUpWith()) {
-      callbacks.emplace_back(callback);
-    }
-    callbacks.emplace_back([](const VectorPtr vecs[],
-                              const ParameterConfig& config,
-                              size_t sparseId) {
-      vecs[PARAMETER_SUM3]->add(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-      vecs[PARAMETER_SUM2]->zeroMem();
-    });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageOptimizer::finishBatch() {
-  optimizer_->finishBatch();
-  if (isAverageWindowTooLong()) {
-    this->finishCatchUpWith();
-    oldNumAccumulates_ = numAccumulates_;
-    numAccumulates_ = 0;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::apply() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-
-  real scale = 1. / (numAccumulates_ + oldNumAccumulates_);
-  if (useApply_) {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  } else {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]);
-      vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::restore() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-  if (useApply_) {
-    return nullptr;
-  }
-
-  return [](
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) {
-    vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]);
-    vecs[PARAMETER_GRADIENT]->zeroMem();
-  };
-}
-
-void AverageSparseOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& paraConfig,
-                                    size_t sparseId) const {
-  optimizer_->update(vecs, paraConfig, sparseId);
-
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ + 1 - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith()
-    const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (timer_ > 0) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageSparseOptimizer::catchUpWith(const VectorPtr vecs[],
-                                         const ParameterConfig& paraConfig,
-                                         size_t sparseId) const {
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.h b/paddle/legacy/parameter/AverageOptimizer.h
deleted file mode 100644
index f0fe2fd28..000000000
--- a/paddle/legacy/parameter/AverageOptimizer.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// After Optimization, parameter values are further averaged within
-// time range.
-class AverageOptimizer : public ParameterOptimizer {
- public:
-  // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
-  // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
-  AverageOptimizer(const OptimizationConfig& optConfig,
-                   ParameterOptimizer* optimizer,
-                   bool useParameterApply);
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    ParameterOptimizer* optimizer,
-                                    bool isParameterSparse = false,
-                                    bool useParameterApply = false);
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() {
-    optimizer_->finishPass();
-    updateAverageWindowLimit();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void finishBatch();
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, paraConfig, sparseId);
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
-  virtual TraverseCallback startCatchUpWith() const {
-    return optimizer_->startCatchUpWith();
-  }
-  virtual void finishCatchUpWith() { return optimizer_->finishCatchUpWith(); }
-
-  virtual TraverseCallback apply();
-  virtual TraverseCallback restore();
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  bool useApply_;
-
-  // should only be called from finishPass()
-  void updateAverageWindowLimit() {
-    if (!optConfig_.has_max_average_window()) {
-      // use the number of batches in the last pass as maxAverageWindow_
-      CHECK_GT(numUpdates_, prevNumUpdates_);
-      maxAverageWindow_ = numUpdates_ - prevNumUpdates_;
-      prevNumUpdates_ = numUpdates_;
-    }
-    minAverageWindow_ = std::min(minAverageWindow_, numUpdates_);
-  }
-
-  bool isAverageWindowTooLong() const {
-    return numAccumulates_ >= minAverageWindow_ &&
-           numAccumulates_ >=
-               std::min<int64_t>(maxAverageWindow_,
-                                 numUpdates_ * optConfig_.average_window());
-  }
-
-  static const int64_t kMaxNumAccumulates = 16384;
-  int64_t numUpdates_;
-  int64_t prevNumUpdates_;
-  int64_t numAccumulates_;
-  int64_t oldNumAccumulates_;
-  int64_t minAverageWindow_;
-  int64_t maxAverageWindow_;
-};
-
-// Average Optimizer with Sparse support.
-class AverageSparseOptimizer : public AverageOptimizer {
- public:
-  AverageSparseOptimizer(const OptimizationConfig& optConfig,
-                         ParameterOptimizer* optimizer,
-                         bool useParameterApply)
-      : AverageOptimizer(optConfig, optimizer, useParameterApply) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    AverageOptimizer::init(numRows, config);
-
-    t0Vec_.resize(numRows);
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-  virtual void finishBatch() {
-    AverageOptimizer::finishBatch();
-    timer_++;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& paraConfig,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    optimizer_->finishCatchUpWith();
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
- protected:
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/CMakeLists.txt b/paddle/legacy/parameter/CMakeLists.txt
deleted file mode 100644
index 19ae07e07..000000000
--- a/paddle/legacy/parameter/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# The utilities for paddle
-
-file(GLOB PARAMETERS_HEADERS . *.h)
-file(GLOB PARAMETERS_SOURCES . *.cpp)
-
-add_library(paddle_parameter STATIC
-        ${PARAMETERS_SOURCES})
-add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.cpp b/paddle/legacy/parameter/FirstOrderOptimizer.cpp
deleted file mode 100644
index 4f82a115f..000000000
--- a/paddle/legacy/parameter/FirstOrderOptimizer.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FirstOrderOptimizer.h"
-#include "paddle/legacy/math/TrainingAlgorithmOp.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <cmath>
-
-DEFINE_bool(log_clipping, false, "enable log clipping or not");
-
-namespace paddle {
-
-SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer(
-    const OptimizationConfig& optConfig)
-    : ParameterOptimizer(optConfig) {
-  addParameterType(PARAMETER_MOMENTUM);
-  addParameterType(PARAMETER_MOMENTUM_UT);
-  addParameterType(PARAMETER_MOMENTUM_VT);
-  alpha_ = 1;
-  beta_ = 1;
-  tau_ = -1;
-  threshold_ = 1e+06;
-}
-
-void SparseMomentumParameterOptimizer::init(size_t numRows,
-                                            const ParameterConfig* config) {
-  isParameterSparse_ = numRows != 0;
-  t0Vec_.resize(numRows);
-  t0Vec_.assign(t0Vec_.size(), 0);
-  timer_ = 0;
-  momentum_ = config->momentum();
-  decayRate_ = config->decay_rate();
-  gamma_ = config->learning_rate();
-}
-
-void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) {
-  learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  if (isParameterSparse_) {
-    tau_ = tau_ + beta_ / alpha_;
-    alpha_ = alpha_ / momentum_;
-    beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_);
-  }
-}
-
-void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& paraConfig,
-                                              size_t sparseId) const {
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    if (t0Vec_[sparseId] == 0) {
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-      t0Vec_[sparseId] = 1;
-    }
-    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                     -alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                     tau_ * alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                               tau_ / beta_ + 1.0 / alpha_,
-                               *vecs[PARAMETER_MOMENTUM_VT],
-                               1.0 / beta_);
-
-  } else {
-    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                     *vecs[PARAMETER_MOMENTUM],
-                                     learningRate_ * paraConfig.learning_rate(),
-                                     paraConfig.momentum(),
-                                     applyDecay_ ? paraConfig.decay_rate() : 0);
-  }
-}
-
-ParameterOptimizer::TraverseCallback
-SparseMomentumParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (alpha_ > threshold_ && isParameterSparse_) {
-    //  Restart to avoid large value multiplication
-    //  1. \alpha = 1, \beta = 1, \tau = 0
-    //  2. Note that \tau * u_t + v_t = \beta \theta_t, therefore:
-    //     u_t should be rescaled to u_t/alpha_
-    //     v_t should be reset to \theta_t
-    return [this](const VectorPtr vecs[],
-                  const ParameterConfig& config,
-                  size_t sparseId) {
-      vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_);
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void SparseMomentumParameterOptimizer::finishBatch() {
-  timer_++;
-  if (!isParameterSparse_) return;
-  if (alpha_ > threshold_) {
-    alpha_ = 1;
-    beta_ = 1;
-    tau_ = -1;
-  }
-}
-
-void AdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adagradApply(value,
-               grad,
-               mom,
-               accum_buffer,
-               accum,
-               lr,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate);
-}
-
-ParameterOptimizer::TraverseCallback
-AdagradParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    return [](const VectorPtr vecs[],
-              const ParameterConfig& config,
-              size_t sparseId) {
-      vecs[PARAMETER_GRADIENT_SQURESUM]->add(
-          *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-      vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem();
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
-                                        const ParameterConfig& config,
-                                        size_t sparseId) const {
-  CHECK(sparseId == -1LU) << "Sparse update is not supported";
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adadeltaApply(value,
-                grad,
-                mom,
-                accum,
-                accum_update,
-                lr,
-                rou_,
-                epsilon_,
-                learningRate,
-                momentum,
-                decayRate);
-}
-
-void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  rmspropApply(value,
-               grad,
-               mom,
-               sum,
-               sum1,
-               lr,
-               accumulatedRou,
-               rou_,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate,
-               firstTime);
-}
-
-void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& config,
-                                              size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  decayedAdagradApply(value,
-                      grad,
-                      mom,
-                      sum,
-                      lr,
-                      accumulatedRou,
-                      rou_,
-                      epsilon,
-                      learningRate,
-                      momentum,
-                      decayRate,
-                      firstTime);
-}
-
-void AdamParameterOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& config,
-                                    size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-
-  real beta1_power = std::pow(beta1_, step_);
-  real beta2_power = std::pow(beta2_, step_);
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
-
-  adamApply(value,
-            grad,
-            mom,
-            v,
-            beta1_,
-            beta2_,
-            beta1_power,
-            beta2_power,
-            epsilon_,
-            learningRate);
-}
-
-void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
-                                      const ParameterConfig& config,
-                                      size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
-}
-
-void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
-                                           const ParameterConfig& config,
-                                           size_t sparseId) const {
-  real globalThreshold = optConfig_.gradient_clipping_threshold();
-  real localThreshold = config.gradient_clipping_threshold();
-
-  // Use local gradient clipping threshold if it's enabled,
-  // otherwise using the global one.
-  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
-  std::string field = localThreshold > 0.0f ? "local" : "global";
-
-  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > threshold) {
-    if (FLAGS_log_clipping) {
-      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
-                        vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
-                << field << " threshold=" << threshold
-                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
-    }
-    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
-  }
-  optimizer_->update(vecs, config, sparseId);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h
deleted file mode 100644
index 86b9a591a..000000000
--- a/paddle/legacy/parameter/FirstOrderOptimizer.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterOptimizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "Regularizer.h"
-
-namespace paddle {
-
-// Plain SGD optimization.
-class SgdOptimizer : public ParameterOptimizer {
- public:
-  explicit SgdOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    (void)sparseId;
-    real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
-                                  ? 1.0 - paraConfig.momentum()
-                                  : 1.0;
-#ifdef PADDLE_WITH_MKLDNN
-    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
-                  (firstTime_ ? 1.0 : torch_learningRate),
-              paraConfig.momentum(),
-              applyDecay_ ? paraConfig.decay_rate() : 0,
-              vecs[PARAMETER_VALUE].get(),
-              vecs[PARAMETER_GRADIENT].get(),
-              vecs[PARAMETER_MOMENTUM].get());
-#else
-    vecs[PARAMETER_VALUE]->sgdUpdate(
-        *vecs[PARAMETER_GRADIENT],
-        *vecs[PARAMETER_MOMENTUM],
-        learningRate_ * paraConfig.learning_rate() *
-            (firstTime_ ? 1.0 : torch_learningRate),
-        paraConfig.momentum(),
-        applyDecay_ ? paraConfig.decay_rate() : 0);
-#endif
-  }
-  virtual void finishBatch() { firstTime_ = false; }
-};
-
-// SGD optimization with sparse support.
-class SparseMomentumParameterOptimizer : public ParameterOptimizer {
-  /* sparse momentum optimizer
-
-    update scheme:
-
-    \alpha_t = \alpha_{t-1} / k
-    \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-    u_t = u_{t-1} - \alpha_t \gamma_t g_t
-    v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-    \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-
-    where:
-    k: momentum
-    lambda: decay rate
-    \gamma_t: learning rate at the t'th step
-  */
-
- public:
-  explicit SparseMomentumParameterOptimizer(
-      const OptimizationConfig& optConfig);
-  virtual void init(size_t numRows, const ParameterConfig* config);
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  virtual void finishBatch();
-
- private:
-  real alpha_;
-  real beta_;
-  real tau_;
-  real gamma_;
-  real threshold_;
-  real momentum_;
-  real decayRate_;
-
- protected:
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-  bool isParameterSparse_;
-};
-
-/*
- * AdaGrad optimization.
- * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
- */
-class AdagradParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    numUpdates_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-    ++numUpdates_;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
- protected:
-  int64_t numUpdates_;
-  static const int64_t kMaxNumAccumulates = 16384;
-};
-
-/*
- * AdaDelta Optimization.
- * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
- */
-class AdaDeltaParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-};
-
-// RMSProp Parameter Optimization.
-class RMSPropParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-// Decayed AdaGrad Optimization.
-class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-/**
- * Adam Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
- */
-class AdamParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        epsilon_(optConfig.adam_epsilon()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_SECOND_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real beta1_;
-  real beta2_;
-  real epsilon_;
-  int64_t step_;
-  real learningRate_;
-};
-
-/**
- * AdaMax Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
- */
-class AdamaxParameterOptimizer : public ParameterOptimizer {
- public:
-  explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_WEIGHTED_INFINITY_NORM);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
- protected:
-  real beta1_;
-  real beta2_;
-  int64_t step_;
-  real learningRate_;
-};
-
-// Used in pserver,
-// when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
-class AddOptimizer : public ParameterOptimizer {
- public:
-  explicit AddOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    // learningRate required by regularizer
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT],
-                               optConfig_.delta_add_rate());
-  }
-};
-
-// A optimizer which does nothing.
-class DummyOptimizer : public ParameterOptimizer {
- public:
-  explicit DummyOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {}
-};
-
-// Do gradient clipping before sgd update
-class OptimizerWithGradientClipping : public ParameterOptimizer {
- public:
-  OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
-                                ParameterOptimizer* optimizer)
-      : ParameterOptimizer(optConfig), optimizer_(optimizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-    learningRate_ = optimizer_->getLearningRate();
-  }
-  virtual void finishBatch() { optimizer_->finishBatch(); }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp
deleted file mode 100644
index 68c44a7ec..000000000
--- a/paddle/legacy/parameter/LearningRateScheduler.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LearningRateScheduler.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-ClassRegistrar<LearningRateScheduler, OptimizationConfig>
-    LearningRateScheduler::registrar_;
-
-LearningRateScheduler* LearningRateScheduler::create(
-    const OptimizationConfig& config) {
-  return registrar_.createByType(config.learning_rate_schedule(), config);
-}
-
-// LRS stands for LearningRateScheduler
-
-class BaseLRS : public LearningRateScheduler {
- public:
-  explicit BaseLRS(const OptimizationConfig& config)
-      : learningRate_(config.learning_rate()),
-        a_(config.learning_rate_decay_a()),
-        b_(config.learning_rate_decay_b()) {}
-
- protected:
-  real learningRate_;
-  real a_;
-  real b_;
-};
-
-class ConstLRS : public BaseLRS {
- public:
-  explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_;
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
-
-class PolyLRS : public BaseLRS {
- public:
-  explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
-
-class CaffePolyLRS : public BaseLRS {
- public:
-  explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    if (numSamplesProcessed > a_) {
-      LOG_FIRST_N(WARNING, 1)
-          << "Using caffe_poly learning rate schedule, "
-          << "learning rate hits ZERO when "
-          << "numSamplesProcessed > config.learning_rate_decay_b(), "
-          << "training is over and you can stop it. "
-          << "See common/LearningRateScheduler.cpp for more info.";
-      return 0;
-    } else {
-      return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_);
-    }
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
-
-class ExpLRS : public BaseLRS {
- public:
-  explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    double decayRatio = (double)numSamplesProcessed / b_;
-    return learningRate_ * pow(a_, decayRatio);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
-
-class DiscreteExpLRS : public BaseLRS {
- public:
-  explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    int numDecays = floor(numSamplesProcessed / b_);
-    return learningRate_ * pow(a_, numDecays);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
-
-class LinearLRS : public BaseLRS {
- public:
-  explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
-
-/*
-  specify learning rate through
-  learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK'
-  if seg_{i-1} <= numSamples <= seg_i,
-  then learning_rate = learning_rate_base * rate_i
-*/
-class ManualLRS : public BaseLRS {
- public:
-  explicit ManualLRS(const OptimizationConfig& config)
-      : BaseLRS(config), currentSegment_(0), lastNum_(0) {
-    std::vector<std::string> pieces;
-    str::split(config.learning_rate_args(), ',', &pieces);
-    rates_.reserve(pieces.size());
-    std::string s1, s2;
-
-    for (auto& piece : pieces) {
-      auto pos = piece.find(':');
-      CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: "
-                                      << config.learning_rate_args();
-      segments_.push_back(str::to<int64_t>(piece.substr(0, pos)));
-      rates_.push_back(str::to<real>(piece.substr(pos + 1)));
-    }
-  }
-
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(numSamplesProcessed);
-  }
-
-  real calc(int64_t num) {
-    // We assume that num never decreases.
-    CHECK_LE(lastNum_, num);
-    lastNum_ = num;
-    while (currentSegment_ < rates_.size()) {
-      if (num <= segments_[currentSegment_]) {
-        return learningRate_ * rates_[currentSegment_];
-      }
-      ++currentSegment_;
-      if (currentSegment_ < rates_.size()) {
-        LOG(INFO) << " learning_rate changes to "
-                  << learningRate_ * rates_[currentSegment_];
-      }
-    }
-    return learningRate_ * rates_.back();
-  }
-
- protected:
-  std::vector<real> rates_;
-  std::vector<int64_t> segments_;
-  size_t currentSegment_;
-  int64_t lastNum_;
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
-
-class PassManualLRS : public ManualLRS {
- public:
-  explicit PassManualLRS(const OptimizationConfig& config)
-      : ManualLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(pass);
-  }
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS);
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h
deleted file mode 100644
index fc7e380a6..000000000
--- a/paddle/legacy/parameter/LearningRateScheduler.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/utils/ClassRegistrar.h"
-
-namespace paddle {
-// NOLINTNEXTLINES_4
-#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    LearningRateScheduler::registrar_.registerClass<__class_name>(  \
-        #__type_name);                                              \
-  })
-
-class LearningRateScheduler {
- public:
-  static LearningRateScheduler* create(const OptimizationConfig& config);
-  virtual ~LearningRateScheduler() {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
-
-  static ClassRegistrar<LearningRateScheduler, OptimizationConfig> registrar_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp
deleted file mode 100644
index b7f920b89..000000000
--- a/paddle/legacy/parameter/OptimizerFunctions.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerWithRegularizer.h"
-
-namespace paddle {
-
-// creator for AverageOptimizer
-ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig,
-                                       const ParameterConfig& paraConfig,
-                                       bool isParameterSparse,
-                                       bool inPserver) {
-  ParameterOptimizer* optimizer = OptimizerWithRegularizer::create(
-      optConfig, paraConfig, isParameterSparse, inPserver);
-  return AverageOptimizer::create(
-      optConfig, optimizer, isParameterSparse, inPserver /*useParameterApply*/);
-}
-
-std::vector<ParameterType> sgdOptimizerGetTypes(
-    const OptimizationConfig& optConfig, bool inPserver) {
-  std::unique_ptr<ParameterOptimizer> optimizer;
-  optimizer.reset(
-      AverageOptimizer::create(optConfig,
-                               ParameterOptimizer::create(optConfig, inPserver),
-                               false /*isParameterSparse*/,
-                               inPserver));
-  CHECK(optimizer) << "fail to create optimizer: "
-                   << optConfig.learning_method();
-  return optimizer->getParameterTypes();
-}
-
-bool useApplyInPserver(const OptimizationConfig& optConfig) {
-  auto types = sgdOptimizerGetTypes(optConfig, true /*inPserver*/);
-  return types.end() != std::find(types.begin(), types.end(), PARAMETER_APPLY);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h
deleted file mode 100644
index 57f6fc9d4..000000000
--- a/paddle/legacy/parameter/OptimizerFunctions.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-/*
- * Factory function creates the corresponding SgdOptimizer
- * according to the configuration in optConfig.
- */
-ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig,
-                                       const ParameterConfig& paraConfig,
-                                       bool isParameterSparse,
-                                       bool inPserver);
-
-/*
- * Get the parameter types needed for the specific optimization
- * algorithm specified in optConfig.
- */
-std::vector<ParameterType> sgdOptimizerGetTypes(
-    const OptimizationConfig& optConfig, bool inPserver);
-
-/*
- * Whether trainer need call apply() in pserver and get result back.
- * currently, only averager depend on this.
- */
-bool useApplyInPserver(const OptimizationConfig& optConfig);
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp
deleted file mode 100644
index 9e914ae4e..000000000
--- a/paddle/legacy/parameter/OptimizerWithRegularizer.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "OptimizerWithRegularizer.h"
-
-namespace paddle {
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerEveryNumBatches::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  TraverseCallbackVec callbacks;
-
-  if (isRegularizationBatch(config)) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->doTraversal(vecs, config); });
-  }
-
-  if (auto callback = optimizer_->needSpecialTraversal(config)) {
-    callbacks.emplace_back(callback);
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerEveryNumBatches::doTraversal(
-    const VectorPtr vecs[], const ParameterConfig& config) const {
-  int32_t base =
-      std::max(baseTimer_, (timer_ + 1 - config.num_batches_regularization()));
-  regularizer_->update(
-      vecs, config, optimizer_->getLearningRate(), base, timer_ + 1);
-}
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (baseTimer_ < timer_) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerEveryNumBatches::catchUpWith(
-    const VectorPtr vecs[],
-    const ParameterConfig& config,
-    size_t sparseId) const {
-  int32_t base = timer_ - timer_ % config.num_batches_regularization();
-  regularizer_->update(vecs,
-                       config,
-                       optimizer_->getLearningRate(),
-                       std::max(base, baseTimer_),
-                       timer_);
-}
-
-void OptimizerWithRegularizerSparse::init(size_t numRows,
-                                          const ParameterConfig* config) {
-  OptimizerWithRegularizer::init(numRows, config);
-  t0Vec_.resize(numRows);
-
-  timer_ = 0;
-  t0Vec_.assign(t0Vec_.size(), 0);
-}
-
-void OptimizerWithRegularizerSparse::update(const VectorPtr vecs[],
-                                            const ParameterConfig& config,
-                                            size_t sparseId) const {
-  optimizer_->update(vecs, config, sparseId);
-  // para W(t0) -> W(t+1)
-  CHECK_LT(sparseId, t0Vec_.size());
-  regularizer_->update(vecs,
-                       config,
-                       optimizer_->getLearningRate(),
-                       t0Vec_[sparseId],
-                       timer_ + 1);
-  t0Vec_[sparseId] = timer_ + 1;
-}
-
-ParameterOptimizer::TraverseCallback
-OptimizerWithRegularizerSparse::startCatchUpWith() const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (timer_ > 0) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void OptimizerWithRegularizerSparse::catchUpWith(const VectorPtr vecs[],
-                                                 const ParameterConfig& config,
-                                                 size_t sparseId) const {
-  // para W(t0) -> W(t+1)
-  CHECK_LT(sparseId, t0Vec_.size());
-  regularizer_->update(
-      vecs, config, optimizer_->getLearningRate(), t0Vec_[sparseId], timer_);
-}
-
-// factory method to create instance of OptimizerWithRegularizer
-ParameterOptimizer* OptimizerWithRegularizer::create(
-    const OptimizationConfig& optConfig,
-    const ParameterConfig& paraConfig,
-    bool isParameterSparse,
-    bool inPserver) {
-  ParameterOptimizer* optimizer =
-      ParameterOptimizer::create(optConfig, inPserver);
-  if ((optConfig.gradient_clipping_threshold() > 0.0f ||
-       paraConfig.gradient_clipping_threshold() > 0.0f) &&
-      !dynamic_cast<AddOptimizer*>(optimizer)) {
-    optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
-  }
-  Regularizer* regularizer =
-      Regularizer::get(optimizer->getParameterTypes(), paraConfig);
-  if (!regularizer) {
-    return optimizer;
-  }
-
-  if (paraConfig.num_batches_regularization() > 1) {
-    if (optConfig.num_batches_per_send_parameter() > 1) {
-      CHECK_EQ(optConfig.num_batches_per_send_parameter() %
-                   paraConfig.num_batches_regularization(),
-               0)
-          << "regularization should be apply in sending batch";
-    }
-    CHECK(paraConfig.momentum() == 0.0f) << "Parameter cannot support momentum "
-                                            "if num_batches_regularization set";
-
-    if (optConfig.center_parameter_update_method() == "average" &&
-        optConfig.num_batches_per_send_parameter() ==
-            paraConfig.num_batches_regularization()) {
-      LOG(INFO) << "decay in pserver and no decay in trainer";
-      if (inPserver) {  // decay in pserver
-        optimizer->setNoDecay();
-        return new OptimizerWithRegularizer(optConfig, optimizer, regularizer);
-      }
-      // no decay in trainer
-      optimizer->setNoDecay();
-      return optimizer;
-    }
-    if (dynamic_cast<AddOptimizer*>(optimizer)) {
-      return optimizer;  // normal average, no decay in pserver
-    }
-    // normal
-    optimizer->setNoDecay();
-    return new OptimizerWithRegularizerEveryNumBatches(
-        optConfig, optimizer, regularizer);
-  }
-  if (isParameterSparse) {
-    CHECK(paraConfig.momentum() == 0.0f)
-        << "Parameter cannot support momentum if it's sparse.";
-    optimizer->setNoDecay();
-    return new OptimizerWithRegularizerSparse(
-        optConfig, optimizer, regularizer);
-  }
-  // dense
-  if (paraConfig.decay_rate_l1() == 0.0f ||
-      dynamic_cast<AddOptimizer*>(optimizer)) {
-    return optimizer;
-  }
-  CHECK(paraConfig.momentum() == 0.0f)
-      << "Parameter cannot support momentum if it use L1 decay.";
-  optimizer->setNoDecay();
-  return new OptimizerWithRegularizer(optConfig, optimizer, regularizer);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h
deleted file mode 100644
index bd29b3966..000000000
--- a/paddle/legacy/parameter/OptimizerWithRegularizer.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// add regularizer for objective function to do optimization
-class OptimizerWithRegularizer : public ParameterOptimizer {
- public:
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    const ParameterConfig& paraConfig,
-                                    bool isParameterSparse,
-                                    bool inPserver);
-
-  OptimizerWithRegularizer(const OptimizationConfig& optConfig,
-                           ParameterOptimizer* optimizer,
-                           Regularizer* regularizer)
-      : ParameterOptimizer(optConfig),
-        optimizer_(optimizer),
-        regularizer_(regularizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() {
-    optimizer_->startPass();
-    timer_ = 0;
-  }
-
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-  }
-
-  virtual void finishBatch() {
-    optimizer_->finishBatch();
-    ++timer_;
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-    regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
-  }
-
- protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  Regularizer* regularizer_;
-
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-};
-
-// Regularized Loss function for every num of batches
-class OptimizerWithRegularizerEveryNumBatches
-    : public OptimizerWithRegularizer {
- public:
-  OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
-                                          ParameterOptimizer* optimizer,
-                                          Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void startPass() {
-    OptimizerWithRegularizer::startPass();
-    baseTimer_ = 0;
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const;
-
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() { baseTimer_ = timer_; }
-
- protected:
-  bool isRegularizationBatch(const ParameterConfig& config) const {
-    return ((timer_ + 1) % config.num_batches_regularization() == 0);
-  }
-
-  /**
-   *  recored the timer_ value while catchUpWith called.
-   */
-  int baseTimer_;
-};
-
-// Regularized Loss function with Sparse support
-class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
- public:
-  OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
-                                 ParameterOptimizer* optimizer,
-                                 Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config);
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
- protected:
-  /**
-   *  t0Vec_ are last occur time of i rows
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp
deleted file mode 100644
index 666d808f0..000000000
--- a/paddle/legacy/parameter/Parameter.cpp
+++ /dev/null
@@ -1,425 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Parameter.h"
-#include <gflags/gflags.h>
-#include <fstream>
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "ThreadLocalBuffer.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/math/CpuSparseMatrix.h"
-#include "paddle/legacy/math/MathUtils.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/utils/Logging.h"
-
-DEFINE_int32(enable_grad_share,
-             (100 * 1024 * 1024),
-             "threshold for enable gradient parameter share for batch "
-             "multi-cpu training");
-DEFINE_int32(
-    grad_share_block_num,
-    64,
-    "block number of gradient parameter share for batch multi-cpu training");
-
-namespace paddle {
-
-const std::string Parameter::kMissParameterFail = "fail";
-const std::string Parameter::kMissParameterRand = "rand";
-const std::string Parameter::kMissParameterZero = "zero";
-
-Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(-1),
-      sharedCount_(0),
-      updateCounter_(0),
-      updated_(false),
-      headerFormat_(PARAM_FORMAT_ORIGINAL) {
-  setID(-1); /* capture uninitialized id */
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  if (doInit) {
-    initialize();
-  }
-
-  for (int i = 0; i < config.update_hooks_size(); ++i) {
-    this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i));
-  }
-}
-
-void Parameter::initialize() {
-  SetDevice device(deviceId_);
-
-  bufs_[PARAMETER_VALUE] =
-      Vector::createParallelVector(config_.size(), useGpu_);
-  bufs_[PARAMETER_VALUE]->zeroMem();
-
-  if (config_.is_sparse()) {
-    enableSparseParameter();
-  }
-
-  if (!isStatic()) {
-    bufs_[PARAMETER_GRADIENT] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[PARAMETER_MOMENTUM] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-
-    bufs_[PARAMETER_GRADIENT]->zeroMem();
-    bufs_[PARAMETER_MOMENTUM]->zeroMem();
-  }
-}
-
-void Parameter::randomize(const VectorPtr& value,
-                          const ParameterConfig& config) {
-  if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) {
-    // initialize the parameter as uniform distribution
-    real initial_min = config.initial_mean() - config.initial_std();
-    real initial_max = config.initial_mean() + config.initial_std();
-    value->uniform(initial_min, initial_max);
-    VLOG(1) << config.name() << ": initial_min=" << initial_min
-            << ", initial_max=" << initial_max;
-  } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) {
-    /* Initialize the parameters randomly */
-    value->randnorm(config.initial_mean(), config.initial_std());
-    VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean()
-            << ", initial_std=" << config.initial_std();
-  } else {
-    LOG(FATAL) << "not supported initial_strategy: "
-               << config.initial_strategy();
-  }
-}
-
-void Parameter::randomize() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  SetDevice device(deviceId_);
-  Parameter::randomize(bufs_[PARAMETER_VALUE], config_);
-
-  if (config_.is_sparse()) {
-    if (format_ == SPARSE_CSC) {
-      sparseRand(intBufs_[PARAMETER_COLS]->getData(),
-                 intBufs_[PARAMETER_ROWS]->getData(),
-                 config_.size(),
-                 config_.dims(1) + 1,
-                 config_.dims(0),
-                 useGpu_);
-    } else {
-      sparseRand(intBufs_[PARAMETER_ROWS]->getData(),
-                 intBufs_[PARAMETER_COLS]->getData(),
-                 config_.size(),
-                 config_.dims(0) + 1,
-                 config_.dims(1),
-                 useGpu_);
-    }
-  }
-  setValueUpdated();
-}
-
-void Parameter::zeroMem() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  bufs_[PARAMETER_VALUE]->zeroMem();
-  setValueUpdated();
-  LOG(INFO) << getName() << " set to 0";
-}
-
-bool Parameter::isGradShared(size_t* blockNum) {
-  if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 &&
-      !isGradSparseUpdate() &&
-      this->getSize() > (size_t)FLAGS_enable_grad_share) {
-    if (blockNum) {
-      *blockNum = (size_t)FLAGS_grad_share_block_num;
-    }
-    return true;
-  }
-  return false;
-}
-
-bool Parameter::isValueShared() {
-  return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1;
-}
-
-bool Parameter::isGradSparseUpdate() const {
-  return !useGpu_ && !isStatic() &&
-         (config_.sparse_update() || config_.sparse_remote_update());
-}
-
-void Parameter::setMat(ParameterType pType, int matType) {
-  CHECK(!mats_[pType]);
-
-  if (config_.dims_size() == 0 && matType == MAT_NORMAL) {
-    return;
-  }
-
-  CHECK_EQ((size_t)config_.dims_size(), 2LU);
-  size_t height = config_.dims(0);
-  size_t width = config_.dims(1);
-  if (matType == MAT_NORMAL) {
-    if (!config_.is_sparse()) {
-      CHECK_EQ(height * width, bufs_[pType]->getSize());
-      mats_[pType] =
-          Matrix::create(bufs_[pType]->getMemoryHandle(), height, width);
-    } else {
-      size_t size = bufs_[pType]->getSize();
-      CHECK_GE(height * width, size);
-      if (format_ == SPARSE_CSR) {
-        CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize());
-      } else {
-        CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize());
-      }
-      mats_[pType] =
-          Matrix::createSparseMatrix(bufs_[pType]->getData(),
-                                     intBufs_[PARAMETER_ROWS]->getData(),
-                                     intBufs_[PARAMETER_COLS]->getData(),
-                                     height,
-                                     width,
-                                     bufs_[pType]->getSize(),
-                                     FLOAT_VALUE,
-                                     format_,
-                                     false,
-                                     useGpu_);
-    }
-  }
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOLINTNEXTLINE
-  else if (matType == MAT_NORMAL_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    size_t blockNum = 0;
-    CHECK(isGradShared(&blockNum));
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        blockNum,
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_VALUE_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW_IDS) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SparseRowIdsCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW) {
-    auto valueMat =
-        std::dynamic_pointer_cast<SparseRowCpuMatrix>(mats_[PARAMETER_VALUE]);
-    SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr);
-    if (pType != PARAMETER_VALUE) {
-      CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set "
-                      << " and its type must be MAT_SPARSE_ROW,"
-                      << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW";
-      indexDict = valueMat->getIndexDictHandle();
-    }
-    auto mat =
-        std::make_shared<SparseRowCpuMatrix>(nullptr,
-                                             height,
-                                             width,
-                                             // grad share index with value
-                                             indexDict);
-    mats_[pType] = mat;
-  } else if (matType == MAT_CACHE_ROW) {
-    CHECK(isGradSparseUpdate());
-    auto mat = std::make_shared<CacheRowCpuMatrix>(height, width);
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-             matType == MAT_SPARSE_ROW_PREFETCH) {
-    auto mat = std::make_shared<SparsePrefetchRowCpuMatrix>(
-        bufs_[pType] ? std::dynamic_pointer_cast<CpuMemoryHandle>(
-                           bufs_[pType]->getMemoryHandle())
-                     : nullptr,
-        height,
-        width,
-        nullptr,  // indexDictHandle
-        getGlobalSyncThreadPool());
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
-    CHECK(isGradSparseUpdate());
-    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  }
-#endif
-  // NOLINTNEXTLINE
-  else {
-    LOG(FATAL) << "Unsupported mat type" << matType;
-  }
-}
-
-void Parameter::incUpdate(const UpdateCallback& callback) {
-  // Static parameter is fixed, and does not need to be updated
-  if (isStatic()) {
-    return;
-  }
-
-  ++updateCounter_;
-  if (isUpdatable()) {
-    if (callback) callback(this);
-    clearUpdate();
-  }
-}
-
-bool Parameter::save(const std::string& filename) const {
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-  return save(fs);
-}
-
-bool Parameter::save(std::ostream& s) const {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  header.format = headerFormat_;
-  header.valueSize = sizeof(real);
-  header.size = getSize();
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(s.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter " << getName();
-
-  CHECK(s.write(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)))
-      << "Fail to write parameter " << getName();
-  if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    CHECK(s.write(reinterpret_cast<char*>(rows.getData()),
-                  rows.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-    CHECK(s.write(reinterpret_cast<char*>(cols.getData()),
-                  cols.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-  }
-
-  return true;
-}
-
-/**
- * Load parameter value from a file
- */
-bool Parameter::load(const std::string& filename) {
-  std::ifstream fs(filename, std::ios_base::binary);
-  if (!fs) {
-    LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
-      LOG(FATAL) << getName() << " missing, not allowed.";
-      return false;
-    }
-    if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to random.";
-      randomize();
-      return true;
-    }
-    if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to zero.";
-      zeroMem();
-      return true;
-    }
-    LOG(FATAL) << "unsupported load_missing_parameter_strategy: "
-               << FLAGS_load_missing_parameter_strategy;
-    return false;
-  }
-  return load(fs);
-}
-
-bool Parameter::load(std::istream& s) {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameter " << getName();
-  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
-                                                << header.format;
-  headerFormat_ = header.format;
-  CHECK_EQ(header.size, getSize())
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << getSize() << ") of the parameter: " << getName();
-  CHECK_EQ(header.valueSize, sizeof(real))
-      << "Unsupported valueSize " << header.valueSize << " at: " << getName();
-  CHECK(s.read(reinterpret_cast<char*>(vec.getData()),
-               header.size * sizeof(real)));
-
-  auto& tmp = *bufs_[PARAMETER_VALUE].get();
-  if (typeid(tmp) == typeid(GpuVector)) {
-    bufs_[PARAMETER_VALUE]->copyFrom(vec);
-  }
-
-  if (config_.is_sparse() && config_.need_compact()) {
-    // load from dense parameter with many zero
-    CHECK_EQ(config_.dims_size(), 2);
-    auto height = config_.dims(0);
-    auto width = config_.dims(1);
-    auto mat = Matrix::create(vec.getData(), height, width);
-    CpuSparseMatrix sparseMat(height,
-                              width,
-                              0,
-                              FLOAT_VALUE,
-                              format_,
-                              /*trans*/ false);
-    sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT);
-    auto nnz = sparseMat.getElementCnt();
-    size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz;
-    size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1;
-
-    intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize);
-    intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize);
-    bufs_[PARAMETER_VALUE]->resize(nnz);  // for setMat check
-    bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz);
-    config_.set_size(nnz);
-    LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width))
-              << " name=" << config_.name();
-  } else if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    size_t rowSize, colSize;
-    CHECK_EQ(config_.dims_size(), 2);
-    if (format_ == SPARSE_CSR) {
-      rowSize = config_.dims(0) + 1;
-      colSize = config_.size();
-    } else {
-      rowSize = config_.size();
-      colSize = config_.dims(1) + 1;
-    }
-    CHECK(
-        s.read(reinterpret_cast<char*>(rows.getData()), rowSize * sizeof(int)));
-    CHECK(
-        s.read(reinterpret_cast<char*>(cols.getData()), colSize * sizeof(int)));
-    auto& paramRows = *intBufs_[PARAMETER_ROWS].get();
-    if (typeid(paramRows) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_ROWS]->copyFrom(rows);
-    }
-    auto& paramCols = *intBufs_[PARAMETER_COLS].get();
-    if (typeid(paramCols) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_COLS]->copyFrom(cols);
-    }
-  }
-
-  setValueUpdated();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.h b/paddle/legacy/parameter/Parameter.h
deleted file mode 100644
index 43b567dad..000000000
--- a/paddle/legacy/parameter/Parameter.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "ParameterConfig.pb.h"
-#include "TrainerConfig.pb.h"
-
-#include "ParameterUpdaterHook.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-typedef enum {
-  /// The paddle original basic format
-  PARAM_FORMAT_ORIGINAL = 0,
-
-  /// See mkldnn_memory_format_t in
-  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
-  /// for a detailed description.
-  /// 2D weights tensor in the format (output channels, input channels).
-  PARAM_FORMAT_MKLDNN_OI,
-
-  /// The total format items numbers
-  PARAM_FORMAT_ITEMS,
-} PARAM_FORMAT;
-
-class SparsePrefetchRowCpuMatrix;
-
-class Parameter;
-typedef std::function<void(Parameter* param)> UpdateCallback;
-typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback;
-
-class Parameter;
-typedef std::shared_ptr<Parameter> ParameterPtr;
-
-class Parameter {
- public:
-  Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
-  const std::string& getName() const { return config_.name(); }
-
-  size_t getSize() const { return config_.size(); }
-
-  bool isFullSize() const {
-    if (bufs_[PARAMETER_VALUE]) {
-      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
-    }
-    return false;
-  }
-
-  inline bool useGpu() const { return useGpu_; }
-
-  int getDeviceId() const { return deviceId_; }
-
-  void setDevice(int deviceId) { deviceId_ = deviceId; }
-
-  /// The id ranges from 0 to the_total_number_of_parameters - 1
-  size_t getID() const { return config_.para_id(); }
-
-  /// ID is a implict value created until neural network is built.
-  void setID(size_t id) { config_.set_para_id(id); }
-
-  bool isStatic() const { return config_.is_static(); }
-
-  enum MatType {
-    MAT_NORMAL,
-    /// both value and grad are shared
-    MAT_NORMAL_SHARED,
-
-    /// Now used in BatchNorm in CPU mode
-    MAT_VALUE_SHARED,
-
-    /// sparse matrix, which has full size parameter
-    MAT_SPARSE_ROW_IDS,
-    /// sparse matrix, parameter size scale by sparse rates.
-    MAT_SPARSE_ROW_AUTO_GROW,
-    MAT_CACHE_ROW,
-    MAT_SPARSE_ROW,
-
-    /// sparse matrix for prefetching parameter from pserver
-    MAT_SPARSE_ROW_PREFETCH,
-    /// same as above, but parameter has full size for saving parameter in local
-    MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
-  };
-
-  void enableSparseParameter() {
-    if (config_.is_sparse()) {
-      if (config_.format() == "csr") {
-        size_t height = config_.dims(0);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_ROWS, height + 1);
-        enableIntType(PARAMETER_COLS, nnz);
-        format_ = SPARSE_CSR;
-      } else {
-        size_t width = config_.dims(1);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_COLS, width + 1);
-        enableIntType(PARAMETER_ROWS, nnz);
-        format_ = SPARSE_CSC;
-      }
-    }
-  }
-
-  /// allocate buffer for the give type
-  void enableType(ParameterType type, MatType matType = MAT_NORMAL) {
-    if (bufs_[type] || mats_[type]) {
-      return;
-    }
-    SetDevice device(deviceId_);
-    if (config_.dims_size() == 2) {
-      if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED ||
-          matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-          matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) {
-        bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-        bufs_[type]->zeroMem();
-      } else {
-        CHECK(isGradSparseUpdate());
-      }
-      if (config_.is_sparse() && type == PARAMETER_VALUE) {
-        enableSparseParameter();
-      }
-      setMat(type, matType);
-    } else {
-      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-      bufs_[type]->zeroMem();
-    }
-  }
-
-  void enableBufType(ParameterType type) {
-    if (bufs_[type]) return;
-    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[type]->zeroMem();
-  }
-
-  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
-    if (!intBufs_[type]) {
-      SetDevice device(deviceId_);
-      size_t size = intStoreSize ? intStoreSize : config_.size();
-      intBufs_[type] = IVector::create(size, useGpu_);
-      intBufs_[type]->zeroMem();
-    }
-  }
-
-  void enableSharedType(ParameterType type,
-                        VectorPtr vec,
-                        MatrixPtr mat = nullptr) {
-    if (!bufs_[type] && !mats_[type]) {
-      bufs_[type] = vec;
-      mats_[type] = mat;
-    }
-  }
-
-  /// for batchGradientMachine: blockNum is number of partitions of the matrix.
-  bool isGradShared(size_t* blockNum = NULL);
-
-  bool isValueShared();
-
-  // for AsgdSparseGradientMachine & SgdSparseGradientMachine:
-  // and MultiGradientMachine
-  bool isGradSparseUpdate() const;
-
-  bool isSparseRemoteUpdate() const {
-    return config_.sparse_remote_update() && !useGpu();
-  }
-
-  const ParameterConfig& getConfig() const { return config_; }
-
-  ParameterConfig& getConfig() { return config_; }
-
-  bool hasType(ParameterType pType) const {
-    return bufs_[pType] || mats_[pType];
-  }
-
-  const VectorPtr& getBuf(ParameterType pType) const {
-    return this->bufs_[pType];
-  }
-
-  const VectorPtr* getBufs() const { return bufs_; }
-
-  const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; }
-
-  void setValueUpdated() { updated_ = true; }
-
-  void clearValueUpdated() { updated_ = false; }
-
-  bool isValueUpdated() const { return updated_; }
-
-  /**
-   * Save parameter value to a file
-   */
-  bool save(const std::string& filename) const;
-
-  /**
-   * Save parameter to ostream
-   */
-  bool save(std::ostream& s) const;
-
-  /**
-   * Load parameter value from a file
-   */
-  bool load(const std::string& filename);
-
-  /**
-   * Load parameter from istream
-   */
-  bool load(std::istream& is);
-
-  void incShared() { sharedCount_++; }
-
-  /**
-   * After one of the parameter's gradient is merged
-   * You should call this function to do some additional processing,
-   */
-  void incUpdate(const UpdateCallback& callbacks = NULL);
-
-  void clearGradient() {
-    auto& mat = getMat(PARAMETER_GRADIENT);
-    if (mat) {
-      // zeroMem will also clear rows for SparseRowCpuMatrix
-      mat->zeroMem();
-    } else {
-      auto& gradBuf = getBuf(PARAMETER_GRADIENT);
-      if (gradBuf) gradBuf->zeroMem();
-    }
-  }
-
-  void initialize();
-
-  /**
-   * Initialize the value according to config_: initial_mean,
-   * initial_std and initial_strategy.
-   */
-  void randomize();
-  static void randomize(const VectorPtr& value, const ParameterConfig& config);
-
-  /// Initialize the value to 0
-  void zeroMem();
-
-  /// file header structure
-  struct Header {
-    int32_t format;      // = PARAM_FORMAT
-    uint32_t valueSize;  // = sizeof(real)
-    uint64_t size;       // = getSize()
-  };
-
-  /**
-   * @brief Is the header format supported.
-   */
-  static bool isHeaderFormatSupported(int32_t fmt) {
-    return fmt < PARAM_FORMAT_ITEMS;
-  }
-
-  /**
-   * @brief Get the format in header.
-   */
-  int getHeaderFormat() { return headerFormat_; }
-
-  /**
-   * @brief Set the format in header.
-   */
-  void setHeaderFormat(int32_t fmt) {
-    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
-                                        << fmt;
-    headerFormat_ = fmt;
-  }
-
-  /**
-   * @brief  Parameter Update Hook.
-   *
-   * The parameter's update hook before ParameterUpdater::updateImpl
-   * It could modify gradient/momentum/etc here. Such as drop some gradient,
-   * etc.
-   */
-  void updateHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->update(this);
-    }
-  }
-
-  /**
-   * @brief  Initialize all updater hook.
-   *
-   * This method should be invoked in ParameterUpdater::init() only.
-   */
-  void initHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->init(this);
-    }
-  }
-
- protected:
-  /**
-   * @brief create matrix to matType.
-   *
-   * used by gradient machine which needs specify matrix type,
-   * instead of creating in weights.cpp.
-   *
-   * @note  pType should be enabled already.
-   */
-  void setMat(ParameterType pType, int matType);
-
-  bool isUpdatable() { return (updateCounter_ == sharedCount_); }
-
-  void clearUpdate() { updateCounter_ = 0; }
-
- protected:
-  ParameterConfig config_;
-
-  bool useGpu_;
-
-  int deviceId_;
-
-  /**
-   * @brief bufs_ stores parameter value and gradient.
-   *
-   * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for
-   * calculation and stores gradient to bufs_[PARAMETER_GRADIENT].
-   */
-  VectorPtr bufs_[NUM_PARAMETER_TYPES];
-
-  /**
-   * @brief Weight matrix for bufs_.
-   *
-   * It's helpfull when parameter shared by multi-layers.
-   * Caller should check, if mats exist, do not create it again.
-   */
-  MatrixPtr mats_[NUM_PARAMETER_TYPES];
-
-  /// Int vectors, used in some User defined parameter types
-  IVectorPtr intBufs_[NUM_PARAMETER_TYPES];
-
-  int sharedCount_;
-  int updateCounter_;
-
-  bool updated_;
-  SparseFormat format_;
-
-  /// The header format for saving or loading param
-  int32_t headerFormat_;
-
-  std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
-
- public:
-  void setSharedCount(int cnt) { sharedCount_ = cnt; }
-  int getSharedCount() { return sharedCount_; }
-
-  bool isSparse() { return config_.is_sparse(); }
-  SparseFormat getFormat() { return format_; }
-
-  static const std::string kMissParameterFail;
-  static const std::string kMissParameterRand;
-  static const std::string kMissParameterZero;
-};
-
-typedef std::map<std::string, ParameterPtr> ParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.cpp b/paddle/legacy/parameter/ParameterOptimizer.cpp
deleted file mode 100644
index b9dffa5af..000000000
--- a/paddle/legacy/parameter/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include <fstream>
-
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterOptimizer.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-ParameterOptimizer* ParameterOptimizer::create(
-    const OptimizationConfig& optConfig, bool inPserver) {
-  if (inPserver && optConfig.num_batches_per_send_parameter() > 1) {
-    return new AddOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "torch_momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adagrad") {
-    return new AdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adadelta") {
-    return new AdaDeltaParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "rmsprop") {
-    return new RMSPropParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "decayed_adagrad") {
-    return new DecayedAdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adam") {
-    return new AdamParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adamax") {
-    return new AdamaxParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "sparse_momentum") {
-    return new SparseMomentumParameterOptimizer(optConfig);
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.h b/paddle/legacy/parameter/ParameterOptimizer.h
deleted file mode 100644
index 019afa135..000000000
--- a/paddle/legacy/parameter/ParameterOptimizer.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LearningRateScheduler.h"
-#include "Parameter.h"
-
-namespace paddle {
-
-/**
- * Some member functions are set to const for two reasons:
- *
- * 1. For sparse update thread safe: update(), traverse callback(const this)
- *    may be called many times, each time one row, and these function
- *    can be called parallelly by multi worker, to speed up large block.
- *
- * 2. For predicate functions, needSpecialTraversal(), startCatchUpWith()
- *    may be called many times, should be no state change between calls.
- */
-class ParameterOptimizer {
- public:
-  typedef std::function<void(
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
-      TraverseCallback;
-
- public:
-  explicit ParameterOptimizer(const OptimizationConfig& optConfig)
-      : applyDecay_(true),
-        optConfig_(optConfig),
-        parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT},
-        learningRate_(optConfig.learning_rate()),
-        learningRateScheduler_(LearningRateScheduler::create(optConfig)),
-        pass_(0),
-        firstTime_(true) {}
-
-  real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRateScheduler_->calcLearningRate(numSamplesProcessed, pass);
-  }
-
-  virtual ~ParameterOptimizer() {}
-
-  /**
-   * For sparse update, optimizer can maintain numRows of timer(t0).
-   * Some sparse optimizer depends on parameter config in functions
-   * such as startBatch(). Optimizer can get it here. But notice that,
-   * not all callers can pass config here, so the optimizer should check
-   * config passed in is not null ptr.
-   */
-  virtual void init(size_t numRows, const ParameterConfig* config) {}
-
-  virtual void startPass() {}
-  virtual void finishPass() { ++pass_; }
-
-  /// called by Trainer before forward() of a batch.
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-  }
-
-  /**
-   * following hooks useful for sparse update,
-   * because the traversal in block costs.
-   * called by Trainer after update and before finishBatch
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * startBatch();
-   * if (dense) {
-   *   update(blockVec);
-   * } else {//sparse
-   *   for (row : rows_in_block) {update(rowVec)}
-   * }
-   * auto callback = needSpecialTraversal();
-   * if (callback) {
-   *   // do traverse, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : all_rows_in_block) {callback();}
-   *   }
-   * }
-   * finishBatch();
-   * @endcode
-   *
-   * @return callback if need traverse,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return nullptr;
-  }
-
-  /// called by Trainer after backward() of a batch
-  virtual void finishBatch() {}
-
-  /**
-   * between startBatch() and finishBatch(), update() will be called
-   * by the trainer multiple times, each time for updating one Parameter
-   * with its gradient in PARAMETER_GRADIENT. sparseId is row id,
-   * when sparseId set, update is sparse, each time one row.
-   */
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId = -1LU) const = 0;
-
-  /**
-   * following hooks catch up with current time for sparse update,
-   * In the beginning, call startCatchUpWith() and check return.
-   * In the end, call finishCatchUpWith() to finish state.
-   * callback do the actual works, can call many times for sparse data.
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * auto callback = startCatchUpWith();
-   * if (callback) {
-   *   // do catch up with, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : rows_in_block) {callback();}
-   *   }
-   *   // finish catch up with, main thread
-   *   finishCatchUpWith();
-   * }
-   * @endcode
-   *
-   * @return callback if need catch up with,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback startCatchUpWith() const { return nullptr; }
-  virtual void finishCatchUpWith() {}
-
-  /**
-   * following two hooks used by averager,
-   * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-   *
-   * restore() will restore orginal value if it apply to PARAMETER_VALUE.
-   * Caller must ensure it's catched up with current time before apply.
-   *
-   * Use returned callback same way as callback returned by
-   * ParameterOptimizer::needSpecialTraversal()
-   */
-  virtual TraverseCallback apply() { return nullptr; }
-  virtual TraverseCallback restore() { return nullptr; }
-
-  /// return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  real getLearningRate() const { return learningRate_; }
-
-  virtual void setNoDecay() { applyDecay_ = false; }
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    bool inPserver = false);
-
- protected:
-  typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
-
-  static TraverseCallback composeCallbacks(
-      const TraverseCallbackVec& callbacks) {
-    if (callbacks.size() > 1LU) {
-      return [callbacks](const VectorPtr vecs[],
-                         const ParameterConfig& config,
-                         size_t sparseId) {
-        for (auto callback : callbacks) {
-          callback(vecs, config, sparseId);
-        }
-      };
-    }
-    return (callbacks.size() == 1LU) ? callbacks[0] : nullptr;
-  }
-
-  bool applyDecay_;
-  const OptimizationConfig& optConfig_;
-  std::vector<ParameterType> parameterTypes_;
-
-  /**
-   * global learning rate, init value is opt_config.learning_rate,
-   * sparse regularizer get this value per batch, after StartBatch() called
-   * so, if lr change in StartBatch, please assign to learningRate_
-   */
-  real learningRate_;
-
-  std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
-  int64_t pass_;  // current training pass (starting from 0)
-  bool firstTime_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
deleted file mode 100644
index 72c9841ac..000000000
--- a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Logging.h"
-#ifdef __AVX__
-#include <x86intrin.h>
-#include <xmmintrin.h>
-#endif
-
-#include "ParameterUpdateFunctions.h"
-
-namespace paddle {
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec) {
-  decayRate *= learningRate;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (size_t i = 0; i < size; ++i) {
-    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
-                     decayRate * value[i];
-    value[i] += momentumVec[i];
-  }
-}
-
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec) {
-  size_t size = value->getSize();
-  real* val = value->getData();
-  real* grd = grad->getData();
-  real* mom = momentumVec->getData();
-  if (typeid(*value) == typeid(CpuVector)) {
-    sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom);
-  } else if (typeid(*value) == typeid(GpuVector)) {
-    value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* _grad,
-                  float* momentumVec) {
-#ifdef __AVX__
-  float* grad = const_cast<float*>(_grad);  // the gradient is not modified
-                                            // but when invoke simd functions
-                                            // need non-const pointer.
-  size_t gradientAlign = 0;
-  size_t gradientAlignHeader = (size_t)grad % sizeof(__m256);
-  CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256))
-      << "Gradent buffer didn't align with momentum buffer";
-  CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256))
-      << "Gradent buffer didn't align with value buffer";
-  if (0 != gradientAlignHeader) {
-    gradientAlignHeader = sizeof(__m256) - gradientAlignHeader;
-    gradientAlign = gradientAlignHeader / sizeof(real);
-
-    // handle the unalign buffer
-    for (size_t i = 0; i < gradientAlign; i++) {
-      momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) -
-                       (decayRate * learningRate * value[i]);
-      value[i] += momentumVec[i];
-    }
-    grad += gradientAlign;
-    momentumVec += gradientAlign;
-    value += gradientAlign;
-  }
-
-  constexpr size_t kParallelNum = 8;
-  constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum;
-  size_t cntLoop = (size - gradientAlign) / nStepSize;
-  size_t cntRem = (size - gradientAlign) % nStepSize;
-  __m256 gradientTmp[kParallelNum];
-  __m256 valueTmp[kParallelNum];
-  __m256 lr, mom, dr;
-  std::function<void(void)> loopFun;
-
-  learningRate *= -1;
-  lr = _mm256_set_ps(learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate);
-
-  if (0 != momentum) {
-    mom = _mm256_set_ps(momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum);
-  }
-
-  decayRate *= learningRate;
-  if (0 != decayRate) {
-    dr = _mm256_set_ps(decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate);
-  }
-
-  auto gradMulFun = [&](void) {
-    gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr);
-    gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr);
-    gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr);
-    gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr);
-    gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr);
-    gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr);
-    gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr);
-    gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr);
-  };
-
-  auto valueMulFun = [&](void) {
-    valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr);
-    valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr);
-    valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr);
-    valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr);
-    valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr);
-    valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr);
-    valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr);
-    valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr);
-  };
-
-  auto momentumMulFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 16) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 24) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 32) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 40) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 48) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 56) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom);
-  };
-
-  auto momentumAddGradFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]);
-  };
-
-  auto momentumZeroFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0];
-    *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1];
-    *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2];
-    *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3];
-    *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4];
-    *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5];
-    *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6];
-    *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7];
-  };
-
-  auto momentumAddValueFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]);
-  };
-
-  auto valueAddMomentumFun = [&](void) {
-    *reinterpret_cast<__m256*>(value) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value),
-                      *reinterpret_cast<__m256*>(momentumVec));
-    *reinterpret_cast<__m256*>(value + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8),
-                      *reinterpret_cast<__m256*>(momentumVec + 8));
-    *reinterpret_cast<__m256*>(value + 16) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16),
-                      *reinterpret_cast<__m256*>(momentumVec + 16));
-    *reinterpret_cast<__m256*>(value + 24) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24),
-                      *reinterpret_cast<__m256*>(momentumVec + 24));
-    *reinterpret_cast<__m256*>(value + 32) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32),
-                      *reinterpret_cast<__m256*>(momentumVec + 32));
-    *reinterpret_cast<__m256*>(value + 40) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40),
-                      *reinterpret_cast<__m256*>(momentumVec + 40));
-    *reinterpret_cast<__m256*>(value + 48) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48),
-                      *reinterpret_cast<__m256*>(momentumVec + 48));
-    *reinterpret_cast<__m256*>(value + 56) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56),
-                      *reinterpret_cast<__m256*>(momentumVec + 56));
-  };
-
-  if (0 == decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumZeroFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 == decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumZeroFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  }
-
-  for (size_t i = 0; i < cntLoop; i++) {
-    loopFun();
-    grad += nStepSize;
-    momentumVec += nStepSize;
-    value += nStepSize;
-  }
-
-  for (size_t i = 0; i < cntRem; i++) {
-    momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) +
-                     (decayRate * value[i]);
-    value[i] += momentumVec[i];
-  }
-#endif
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.h b/paddle/legacy/parameter/ParameterUpdateFunctions.h
deleted file mode 100644
index a7cc1c4c4..000000000
--- a/paddle/legacy/parameter/ParameterUpdateFunctions.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Common.h"
-
-namespace paddle {
-
-/**
- * Performs the following operations.
- *
- * momentumVec = momentum * momentumVec
- *               - learningRate * grad
- *               - learningRate * decayRate * value
- *
- * value = value + momentumVec
- * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary
- * computation.
- */
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec);
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec);
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* grad,
-                  float* momentumVec);
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.cpp b/paddle/legacy/parameter/ParameterUpdaterBase.cpp
deleted file mode 100644
index 7d9d3fad6..000000000
--- a/paddle/legacy/parameter/ParameterUpdaterBase.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterBase.h"
-#include <fstream>
-#include "hl_gpu.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-void ParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-  for (ParameterType type : getParameterTypes()) {
-    for (auto& para : parameters) {
-      para->enableType(type);
-    }
-  }
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    nonStaticParaIDMap_.insert(
-        std::pair<size_t, size_t>(parameters_[pid]->getID(), pid));
-  }
-
-  for (auto& para : parameters) {
-    if (!para->isStatic()) {
-      para->initHook();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.h b/paddle/legacy/parameter/ParameterUpdaterBase.h
deleted file mode 100644
index 493512886..000000000
--- a/paddle/legacy/parameter/ParameterUpdaterBase.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Parameter.h"
-
-namespace paddle {
-
-class ParameterOptimizer;
-
-class ParameterUpdater {
- public:
-  ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
-  virtual ~ParameterUpdater() {}
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  // called by Trainer when starting a new pass
-  virtual void startPass() {}
-
-  // called by Trainer then finishing a pass, ruturn true if pass accepted
-  virtual bool finishPass() { return true; }
-
-  // called by Trainer before backward() of a batch
-  // Return the type of pass it needs. This pass type will be passed
-  // to GradientMachine::forward() by the caller.
-  virtual PassType startBatch(int64_t batchSize) {
-    (void)batchSize;
-    return PASS_TRAIN;
-  }
-
-  // called by Trainer after backward() of a batch
-  // cost: the cost for this batch
-  virtual void finishBatch(real cost) { (void)cost; }
-
-  // between startBatch() and finishBatch(), update() will be called
-  // by the trainer multiple times, each time for updating one Parameter
-  // with its gradient in PARAMETER_GRADIENT
-  void update(Parameter* para) {
-    SetDevice setDevice(para->getDeviceId());
-    para->updateHook();
-    this->updateImpl(para);
-  }
-
-  // only get required sparse rows by default,
-  // get full matrix parameter if *fullSize* set
-  // get PARAMETER_APPLY on pserver if *apply* set
-  virtual void getParametersRemote(bool fullSize = false, bool apply = false) {}
-
-  virtual void loadParametersRemote(const std::string& dirName) {}
-  virtual void saveParametersRemote(const std::string& dirName) {}
-  virtual void randParametersRemote() {}
-
-  // something like regularization may be delayed apply
-  // trainer should catch up with before parameter is saved or sended.
-  virtual void catchUpWith() {}
-
-  // following two hooks used by averager
-  // apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-  // restore() will restore orginal value if it apply to PARAMETER_VALUE.
-  virtual void apply() {}
-  virtual void restore() {}
-
-  // return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {}
-#endif
-
- protected:
-  virtual void updateImpl(Parameter* para) = 0;
-
-  std::vector<ParameterType> parameterTypes_;
-  std::vector<ParameterPtr> parameters_;
-  std::map<size_t, size_t> nonStaticParaIDMap_;
-};
-
-// Composite of ParameterUpdaters, each ParameterUpdater handle
-// part of all Parameters. It's useful when we need different
-// update strategy for different Parameter.
-class ParameterUpdaterComposite : public ParameterUpdater {
- public:
-  ParameterUpdaterComposite() {}
-  virtual ~ParameterUpdaterComposite() {}
-
-  virtual void init(const std::vector<ParameterPtr>& parameters) = 0;
-
-  virtual void startPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->startPass(); });
-  }
-
-  virtual bool finishPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(); });
-    return true;
-  }
-
-  virtual PassType startBatch(int64_t batchSize) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->startBatch(batchSize);
-    });
-    return PASS_TRAIN;
-  }
-
-  virtual void finishBatch(real cost) {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishBatch(cost); });
-  }
-
-  virtual void getParametersRemote(bool fullSize, bool apply) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->getParametersRemote(fullSize, apply);
-    });
-  }
-  virtual void loadParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->loadParametersRemote(dirName);
-    });
-  }
-  virtual void saveParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->saveParametersRemote(dirName);
-    });
-  }
-  virtual void randParametersRemote() {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->randParametersRemote();
-    });
-  }
-
-  virtual void catchUpWith() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->catchUpWith(); });
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    for (auto& updater : updaters_) {
-      updater->setForwardbackwardTime(delta);
-    }
-  }
-#endif
-
-  virtual void apply() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->apply(); });
-  }
-  virtual void restore() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
-  }
-
- protected:
-  virtual void updateImpl(Parameter* para) {}
-  std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.cpp b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
deleted file mode 100644
index bfb9769fb..000000000
--- a/paddle/legacy/parameter/ParameterUpdaterHook.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterHook.h"
-
-#include <algorithm>
-#include <atomic>
-#include <fstream>
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * The static pruning hook
- * Static means user specify a sparsity_ratio before training started, and the
- * network will prune the parameters based on the sparsity_ratio. More details
- * can be found https://arxiv.org/pdf/1506.02626.pdf.
- */
-
-class StaticPruningHook : public IParameterUpdaterHook {
- public:
-  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
-      : initCount_(0) {
-    sparsityRatio_ = hookConfig.sparsity_ratio();
-  }
-
-  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
-                             const std::pair<real, size_t> &pair2) {
-    return pair1.first > pair2.first;
-  }
-
-  void update(Parameter *para) {
-    updateThreadChecker_.check();
-    auto &vec = para->getBuf(PARAMETER_GRADIENT);
-    if (vec) {
-      vec->dotMul(*maskVec_);
-    }
-  }
-
-  void generateMask(Parameter *para) {
-    VectorPtr maskTemp = Vector::create(para->getSize(), false);
-    maskTemp->zeroMem();
-    real *maskTempData = maskTemp->getData();
-    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
-
-    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
-    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
-
-    paraCpuCopy->copyFrom(*paraVec);
-    std::vector<std::pair<real, size_t>> param;
-
-    for (size_t i = 0; i < para->getSize(); i++)
-      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
-
-    std::partial_sort(
-        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
-    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
-
-    // Currently just use a mask vector for hack.
-    if (para->useGpu()) {
-      maskVec_ = Vector::create(para->getSize(), para->useGpu());
-      maskVec_->copyFrom(*maskTemp);
-    } else {
-      maskVec_ = maskTemp;
-    }
-  }
-
-  void init(Parameter *para) {
-    generateMask(para);
-    size_t initCount = this->initCount_.fetch_add(1);
-    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
-                                "in same ParamterUpdater";
-    VLOG(3) << "Initialize Parameter " << para;
-    SetDevice device(para->getDeviceId());
-
-    auto &paraVec = para->getBuf(PARAMETER_VALUE);
-    paraVec->dotMul(*maskVec_);
-  }
-
- private:
-  SameThreadChecker updateThreadChecker_;
-  std::atomic<size_t> initCount_;
-  VectorPtr maskVec_;
-  real sparsityRatio_;
-};
-
-IParameterUpdaterHook::IParameterUpdaterHook() {}
-
-IParameterUpdaterHook::~IParameterUpdaterHook() {}
-
-/**
- * A Hasher used by g_hooks.
- *
- * Use the independent hasher intendedly. There is a hasher in PServer for hash
- * ParameterBlock. But not to use same hasher to reduce dependency.
- *
- * May be extracted to Util.h to unify the hasher.
- */
-class StringIntPairHasher {
- public:
-  size_t operator()(const std::pair<std::string, int> &k) const {
-    return intHasher_(strHasher_(k.first) + k.second);
-  }
-
- private:
-  std::hash<std::string> strHasher_;
-  std::hash<int> intHasher_;
-};
-
-static WeakKVCache<std::pair<std::string, int>,
-                   IParameterUpdaterHook,
-                   StringIntPairHasher>
-    g_hookCache_;
-
-/**
- * ParameterUpdaterHook actually factory method.
- */
-static IParameterUpdaterHook *createImpl(
-    const ParameterUpdaterHookConfig &config) {
-  auto &type = config.type();
-  if (type == "pruning") {
-    return new StaticPruningHook(config);
-  }
-
-  LOG(FATAL) << "Unknown Hook type:  " << type;
-  return nullptr;
-}
-
-std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
-    const ParameterConfig &paramConfig, int idx) {
-  std::pair<std::string, int> key = {paramConfig.name(), idx};
-  return g_hookCache_.get(
-      key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.h b/paddle/legacy/parameter/ParameterUpdaterHook.h
deleted file mode 100644
index cb96e4cf0..000000000
--- a/paddle/legacy/parameter/ParameterUpdaterHook.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class Parameter;
-
-/**
- * The parameter updater hook interface.
- *
- * The Parameter Updater hooks is a group of methods invoke before
- * ParameterUpdater::updateImpl. It can modify gradient/momentum/etc before
- * parameter optimization.
- */
-class IParameterUpdaterHook {
- public:
-  virtual ~IParameterUpdaterHook();
-
-  /**
-   * Create A ParameterUpdaterHook.
-   *
-   * The same parameter shared the same hooks. So it returns shared_ptr.
-   *
-   * @param param_config The parameter config.
-   * @param idx  The element index of param_config.updater_hooks() array.
-   */
-  static std::shared_ptr<IParameterUpdaterHook> create(
-      const ParameterConfig& paramConfig, int idx);
-
-  /**
-   * The update hook method. Invoke before ParameterUpdater::updateImpl
-   */
-  virtual void update(Parameter* para) = 0;
-
-  /**
-   * The init hook method. Invoke in ParameterUpdater::init
-   */
-  virtual void init(Parameter* para) = 0;
-
- protected:
-  /**
-   * Ctor.
-   */
-  IParameterUpdaterHook();
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.cpp b/paddle/legacy/parameter/Regularizer.cpp
deleted file mode 100644
index c1d5f4fa6..000000000
--- a/paddle/legacy/parameter/Regularizer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Regularizer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-Regularizer* Regularizer::get(const std::vector<ParameterType>& types,
-                              const ParameterConfig& paraConfig) {
-  bool useLearningRateVec =
-      std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) !=
-      types.end();
-  if (paraConfig.decay_rate_l1() > 0.0f &&
-      paraConfig.decay_rate() > 0.0f) {  // use L1 and L2
-    if (useLearningRateVec) {
-      static L1L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate_l1() > 0.0f) {  // use L1 only
-    if (useLearningRateVec) {
-      static L1LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate() > 0.0f) {  // use L2 only
-    if (useLearningRateVec) {
-      static L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.h b/paddle/legacy/parameter/Regularizer.h
deleted file mode 100644
index fa5384e23..000000000
--- a/paddle/legacy/parameter/Regularizer.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterUpdaterBase.h"
-
-namespace paddle {
-
-// Regularizer function for parameter, e.g. L1/L2
-class Regularizer {
- public:
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,  // learningrate from optimizer
-                      int t0,             // last occurence time
-                      int t) const = 0;   // current time
-  virtual ~Regularizer() {}
-
-  static Regularizer* get(const std::vector<ParameterType>& types,
-                          const ParameterConfig& paraConfig);
-};
-
-// L1 Regularizer, |w|_1
-class L1Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L1 Lr Regularizer
-class L1LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L2 Regularizer, |w|_2^2
-class L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L2 Lr Regularizer
-class L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Regularizer, |w|_1 + |w|_2^2
-class L1L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Lr Regularizer
-class L1L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.cpp b/paddle/legacy/parameter/ThreadLocalBuffer.cpp
deleted file mode 100644
index 550e41dfd..000000000
--- a/paddle/legacy/parameter/ThreadLocalBuffer.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadLocalBuffer.h"
-#include "Parameter.h"
-
-namespace paddle {
-namespace parameter {
-
-static ThreadLocal<std::vector<VectorPtr>> tlsTempBufs_;
-
-VectorPtr* getThreadLocalBuffer() {
-  std::vector<VectorPtr>& bufs = *tlsTempBufs_;
-  if (bufs.empty()) {
-    bufs.resize(NUM_PARAMETER_TYPES);
-    for (auto& vec : bufs) {
-      vec.reset(new CpuVector(0, nullptr));
-    }
-  }
-  return bufs.data();
-}
-
-}  // namespace parameter
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.h b/paddle/legacy/parameter/ThreadLocalBuffer.h
deleted file mode 100644
index d360feeed..000000000
--- a/paddle/legacy/parameter/ThreadLocalBuffer.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/legacy/math/Vector.h"
-
-namespace paddle {
-namespace parameter {
-extern VectorPtr* getThreadLocalBuffer();
-}  // namespace parameter
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.cpp b/paddle/legacy/parameter/Weight.cpp
deleted file mode 100644
index 9d94050a5..000000000
--- a/paddle/legacy/parameter/Weight.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Weight.h"
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  if (param->isSparse()) {
-    CHECK_LE(param->getSize(), width * height);
-  } else {
-    CHECK_EQ(param->getSize(), width * height);
-  }
-
-  // weight_
-  weight_ = param->getMat(PARAMETER_VALUE);
-  if (!weight_ && vPtr) {
-    weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width);
-  }
-  if (weight_) {
-    CHECK_EQ(height, weight_->getHeight());
-    CHECK_EQ(width, weight_->getWidth());
-  }
-
-  // weightGrad
-  weightGrad_ = param->getMat(PARAMETER_GRADIENT);
-  if (!weightGrad_ && gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width);
-  }
-  if (weightGrad_) {
-    CHECK_EQ(height, weightGrad_->getHeight());
-    CHECK_EQ(width, weightGrad_->getWidth());
-  }
-
-  parameter_ = param;
-}
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  CHECK_LE(offset + width * height, param->getSize());
-
-  // weight_
-  if (vPtr) {
-    weight_ = Matrix::create(vPtr->getData() + offset,
-                             height,
-                             width,
-                             /* trans */ false,
-                             param->useGpu());
-  }
-
-  // weightGrad
-  if (gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getData() + offset,
-                                 height,
-                                 width,
-                                 /* trans */ false,
-                                 param->useGpu());
-  }
-
-  parameter_ = param;
-}
-
-const ParameterPtr& Weight::getParameterPtr() { return parameter_; }
-void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; }
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.h b/paddle/legacy/parameter/Weight.h
deleted file mode 100644
index 241c8d829..000000000
--- a/paddle/legacy/parameter/Weight.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <vector>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/parameter/Parameter.h"
-
-namespace paddle {
-
-class Weight {
- private:
-  MatrixPtr weight_;
-  MatrixPtr weightGrad_;
-  ParameterPtr parameter_;
-
- public:
-  Weight(size_t height, size_t width, ParameterPtr parameter);
-  Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
-
-  const MatrixPtr& getW() { return weight_; }
-  const MatrixPtr& getWGrad() { return weightGrad_; }
-  const ParameterPtr& getParameterPtr();
-
-  void incUpdate(const UpdateCallback& callback) {
-    getParameterPtr()->incUpdate(callback);
-  }
-
-  void setParameterPtr(ParameterPtr param);
-};
-
-typedef std::vector<std::unique_ptr<Weight>> WeightList;
-
-}  // namespace paddle
diff --git a/paddle/legacy/parameter/tests/CMakeLists.txt b/paddle/legacy/parameter/tests/CMakeLists.txt
deleted file mode 100644
index 181ccdc1f..000000000
--- a/paddle/legacy/parameter/tests/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_simple_unittest(test_common)
-add_simple_unittest(test_argument)
diff --git a/paddle/legacy/parameter/tests/test_argument.cpp b/paddle/legacy/parameter/tests/test_argument.cpp
deleted file mode 100644
index 0c632e0cd..000000000
--- a/paddle/legacy/parameter/tests/test_argument.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/parameter/Argument.h>
-
-using namespace paddle;  // NOLINT
-
-TEST(Argument, poolSequenceWithStride) {
-  Argument input, output;
-  ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
-  int* inStart = input.sequenceStartPositions->getMutableData(false);
-  inStart[0] = 0;
-  inStart[1] = 9;
-  inStart[2] = 14;
-  inStart[3] = 17;
-  inStart[4] = 30;
-
-  int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
-  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
-
-  for (auto reversed : {false, true}) {
-    ICpuGpuVectorPtr stridePositions;
-    output.poolSequenceWithStride(
-        input, 5 /* stride */, &stridePositions, reversed);
-
-    const int* outStart = output.sequenceStartPositions->getData(false);
-    CHECK_EQ(outStart[0], 0);
-    CHECK_EQ(outStart[1], 2);
-    CHECK_EQ(outStart[2], 3);
-    CHECK_EQ(outStart[3], 4);
-    CHECK_EQ(outStart[4], 7);
-
-    CHECK_EQ(stridePositions->getSize(), 8UL);
-    auto result = reversed ? strideResultReversed : strideResult;
-    for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/parameter/tests/test_common.cpp b/paddle/legacy/parameter/tests/test_common.cpp
deleted file mode 100644
index 8de9d6da9..000000000
--- a/paddle/legacy/parameter/tests/test_common.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/Util.h>
-#include <stdlib.h>
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
-#include <paddle/legacy/utils/Flags.h>
-#include <paddle/legacy/utils/Stat.h>
-#include <paddle/legacy/utils/Thread.h>
-
-using namespace paddle;  // NOLINT
-
-class CommonTest : public ::testing::Test {
- protected:
-  CommonTest() : testStat_("test") {}
-  virtual ~CommonTest() {}
-  virtual void SetUp() {
-    const size_t buffSize[] = {
-        100, 128, 500, 1024, 4096, 10240, 102400, 1000000};
-    sizeVec_.resize(8);
-    memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t));
-    valueUint_.resize(4);
-    valueUint_[0].first = 0.0;
-    valueUint_[0].second = 0.0;
-    valueUint_[1].first = 0.0;
-    valueUint_[1].second = 1.0;
-    valueUint_[2].first = 1.0;
-    valueUint_[2].second = 0.0;
-    valueUint_[3].first = 1.0;
-    valueUint_[3].second = 1.0;
-    learningRate_ = 1.0;
-  }
-
-  void test_sgdUpadate(real* gradientBuffer,
-                       real* valueBuffer,
-                       real* momentumBuffer,
-                       size_t size);
-
-  virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
-
- protected:
-  std::vector<std::pair<real, real>> valueUint_;
-  std::vector<size_t> sizeVec_;
-  real learningRate_;
-  StatSet testStat_;
-};
-
-void CommonTest::test_sgdUpadate(real* gradientBuffer,
-                                 real* valueBuffer,
-                                 real* momentumBuffer,
-                                 size_t size) {
-// sgdUpdateAvx has no double version yet
-#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE)
-  real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0;
-  real* gradTmp = new real[size];
-  real* valueTmp = new real[size];
-  real* momentumTmp = new real[size];
-  memcpy(gradTmp, gradientBuffer, size * sizeof(real));
-  memcpy(valueTmp, valueBuffer, size * sizeof(real));
-  memcpy(momentumTmp, momentumBuffer, size * sizeof(real));
-  for (auto& arg : valueUint_) {
-    {
-      {
-        struct timeval t;
-        REGISTER_TIMER("gettimeofday", 0, testStat_);
-        gettimeofday(&t, NULL);
-      }
-      REGISTER_TIMER("avxTimer", 0);
-      sgdUpdateAvx(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueBuffer,
-                   gradientBuffer,
-                   momentumBuffer);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum1 += valueBuffer[i];
-      momSum1 += momentumBuffer[i];
-      // std::cout << "["
-      //          << valueBuffer[i]
-      //          << "," << momentumBuffer[i]
-      //          << "," << gradientBuffer[i] << "],";
-    }
-    {
-      REGISTER_TIMER("cpuTimer", 0);
-      sgdUpdateCpu(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueTmp,
-                   gradTmp,
-                   momentumTmp);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum2 += valueTmp[i];
-      momSum2 += momentumTmp[i];
-      // std::cout << "["
-      //          << valueTmp[i]
-      //          << "," << momentumTmp[i]
-      //          << "," << gradTmp[i] << "],";
-    }
-
-    VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2;
-    VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2;
-    ASSERT_EQ(valueSum1, valueSum2);
-    ASSERT_EQ(momSum1, momSum2);
-  }
-  delete[] gradTmp;
-  delete[] valueTmp;
-  delete[] momentumTmp;
-#endif
-}
-
-TEST_F(CommonTest, sgdUpdate) {
-  const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
-  for (auto& size : sizeVec_) {
-    real *gradientBuffer, *valueBuffer, *momentumBuffer;
-    CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
-             0);
-    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
-    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
-             0);
-
-    for (size_t i = 0; i < size; i++) {
-      gradientBuffer[i] = 1.0;
-      valueBuffer[i] = 2.0;
-      momentumBuffer[i] = 3.0;
-    }
-    for (int i = 0; i < 6; i++) {
-      LOG(INFO) << "----------------------" << size << ":" << alignHeader[i]
-                << "-------------------------";
-      test_sgdUpadate(&gradientBuffer[alignHeader[i]],
-                      &valueBuffer[alignHeader[i]],
-                      &momentumBuffer[alignHeader[i]],
-                      size - alignHeader[i]);
-    }
-    free(gradientBuffer);
-    free(valueBuffer);
-    free(momentumBuffer);
-  }
-  globalStat.printAllStatus();
-  testStat_.printAllStatus();
-}
-
-TEST_F(CommonTest, syncThreadPool) {
-  SyncThreadPool pool(10);
-
-  std::vector<int> nums;
-  nums.resize(10);
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)i, nums[i]);
-  }
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)0, nums[i]);
-  }
-}
diff --git a/paddle/legacy/pserver/BaseClient.cpp b/paddle/legacy/pserver/BaseClient.cpp
deleted file mode 100644
index 13bb8a1cc..000000000
--- a/paddle/legacy/pserver/BaseClient.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseClient.h"
-#include <gflags/gflags.h>
-#include <string.h>
-#include <vector>
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_string(pservers);
-
-namespace paddle {
-
-BaseClient::BaseClient(bool separate, int numPorts)
-    : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) {
-  CHECK_GT(numPorts, 0);
-}
-
-BaseClient::~BaseClient() {}
-
-void BaseClient::recvData() { recvSyncBarrier_->wait(); }
-
-void BaseClient::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void BaseClient::startThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1));
-
-  sendThreads_.resize(threadNum_);
-  recvThreads_.resize(threadNum_);
-  sendJobQueue_.resize(threadNum_);
-  recvJobQueue_.resize(threadNum_);
-
-  for (int i = 0; i < threadNum_; ++i) {
-    sendJobQueue_[i].reset(new SendQueue());
-    recvJobQueue_[i].reset(new SendQueue());
-
-    sendThreads_[i].reset(
-        new std::thread([this](int id) { this->send(id); }, i));
-
-    recvThreads_[i].reset(
-        new std::thread([this](int id) { this->recv(id); }, i));
-  }
-}
-
-void BaseClient::finishThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  stopping_ = true;
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(nullptr);
-  }
-  for (auto& thread : sendThreads_) {
-    thread->join();
-  }
-  for (auto& thread : recvThreads_) {
-    thread->join();
-  }
-  stopping_ = false;
-}
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/BaseClient.h b/paddle/legacy/pserver/BaseClient.h
deleted file mode 100644
index 66e8f39cd..000000000
--- a/paddle/legacy/pserver/BaseClient.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterService.pb.h"
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/pserver/ProtoServer.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Queue.h"
-
-namespace paddle {
-
-/**
- * it manages all connections to pservers.
- * it exists two modes to manage connections to all pservers. Firstly, one
- * connection owns two threads that separately manage to send and receive
- * data. Secondly, each thread uses one connection for all activation in it.
- * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
- * recvJobQueue_. the second solution use some shared thread pool to manage
- * connections.
- */
-class BaseClient {
- protected:
-  typedef std::unique_ptr<std::thread> ThreadPtr;
-  typedef std::vector<std::vector<iovec>> InputIovs;
-  typedef std::vector<SendParameterRequest> SendRequest;
-  typedef std::vector<SendDataRequest> SendDataRequestVec;
-
-  // TODO(yanfei):
-  // refine data structure to unify parameter and features communication
-  struct SendJob {
-    /// store parameters related blocks data
-    InputIovs parallelInputIovs;
-    /// store protobuf request
-    SendRequest parallelRequests;
-    /// store data, such as features for metric learning
-    SendDataRequestVec parallelDataRequests;
-  };
-
- public:
-  explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
-
-  virtual ~BaseClient();
-
-  typedef std::shared_ptr<SendJob> SendJobPtr;
-  typedef Queue<SendJobPtr> SendQueue;
-
-  /// send data to server, support only synchronize
-  template <class DataType>
-  void putData(int clientId,
-               SendDataType type,
-               DataType* datas,
-               size_t size,
-               DataUpdateMode mode) {
-    synchronize(SYNC_DATA);
-    sendData(clientId, type, mode, datas, size);
-    recvData();
-    synchronize(SYNC_DATA);
-  }
-
-  template <class DataType>
-  void putOwnData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN);
-  }
-
-  template <class DataType>
-  void getAllData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    sendData(clientId,
-             type,
-             DATA_UPDATE_MODE_GET_ALL,
-             reinterpret_cast<DataType*>(NULL),
-             0);
-    recvData();
-    size_t dataOffset = 0;
-    for (auto& recvMem : recvDataMems_) {
-      CHECK_LE(dataOffset, size);
-      size_t memSize = std::min(recvMem.get()->getSize(),
-                                sizeof(DataType) * (size - dataOffset));
-      CHECK_EQ(memSize % sizeof(DataType), size_t(0));
-      memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize);
-      dataOffset += memSize / sizeof(DataType);
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * Reduces values on all clients.
-   * This reduce just support SUM.
-   * The results are saved in recvBuf of rootId client
-   */
-  template <class DataType>
-  void reduce(DataType* sendBuf,
-              DataType* recvBuf,
-              size_t size,
-              int clientId,
-              int rootId) {
-    putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size);
-    if (rootId == clientId) {
-      getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size);
-    }
-  }
-
-  /**
-   * return trans data type according to the input type
-   */
-  virtual TransDataType getTransDtype(const std::type_info& info) {
-    TransDataType dataType;
-    if (typeid(int*) == info) {  // NOLINT
-      dataType = TRANS_INT32;
-    } else if (typeid(uint32_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT32_T;
-    } else if (typeid(int64_t*) == info) {  // NOLINT
-      dataType = TRANS_INT64_T;
-    } else if (typeid(uint64_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT64_T;
-    } else if (typeid(float*) == info) {  // NOLINT
-      dataType = TRANS_FLOAT;
-    } else if (typeid(double*) == info) {  // NOLINT
-      dataType = TRANS_DOUBLE;
-    } else {
-      LOG(FATAL) << "not supported";
-    }
-    return dataType;
-  }
-
- protected:
-  /// for a > 0, b > 0:
-  /// return the smallest x s.t. b*x >= a
-  static int divup(int a, int b) { return (a + b - 1) / b; }
-
-  int calcClientId(int i, int serviceNum) {
-    return (i + FLAGS_trainer_id * numPorts_) % serviceNum;
-  }
-
-  /// start threads in sendThreads_ and recvThreads_
-  void startThreads();
-
-  /// finish threads in sendThreads_ and recvThreads_
-  void finishThreads();
-
-  template <class DataType>
-  void prepareData(int clientId,
-                   SendDataType type,
-                   DataUpdateMode updateMode,
-                   DataType* datas,
-                   size_t size,
-                   SendJob* sendJob) {
-    sendJob->parallelDataRequests.resize(serviceNum_);
-    sendJob->parallelInputIovs.resize(serviceNum_);
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      request.set_update_mode(updateMode);
-      request.set_type(type);
-      request.set_client_id(clientId);
-      request.set_server_id(i);
-    }
-
-    /// split datas which need send to Server into serviceNum_ pieces
-    if (!datas) {
-      CHECK(!size) << "ownSize should be zero since datas is nullptr";
-    }
-    size_t baseSize = size / serviceNum_;
-    size_t dataOffset = 0;
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      DataBlock* block = request.add_blocks();
-      size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize;
-      size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0;
-      block->set_total_size(realSize * sizeof(DataType));
-      block->set_data_size(sizeof(DataType));
-      // TODO(yuyang18): The getTransDtype can be rewritten as template method
-      //                 to reduce runtime overhead.
-      block->set_data_type(getTransDtype(typeid(DataType*)));  // NOLINT
-      if (datas) {
-        sendJob->parallelInputIovs[i].push_back(
-            {datas + dataOffset, realSize * sizeof(DataType)});
-      }
-      dataOffset += ownSize;
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * @brief send data to all data servers
-   *
-   * @note  each trainer sends all its data to all data servers
-   *        it's for broadcast data synchronization, such as features
-   *        synchronization in metric learning.
-   */
-  template <class DataType>
-  void sendData(int clientId,
-                SendDataType type,
-                DataUpdateMode updateMode,
-                DataType* datas,
-                size_t size) {
-    SendJobPtr sendJob = std::make_shared<SendJob>();
-    prepareData(clientId, type, updateMode, datas, size, sendJob.get());
-    for (int i = 0; i < threadNum_; ++i) {
-      sendJobQueue_[i]->enqueue(sendJob);
-    }
-  }
-
-  /**
-   * @brief recv data from all data servers
-   *
-   * @note  synchronize all recv threads
-   */
-  void recvData();
-
-  /// send request, and recv responses
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
-  /**
-   * @brief synchronize all trainers and pservers
-   *
-   * @note  used to ensure that data of all trainers have been received
-   */
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /**
-   * @brief use multithread to separately send data
-   *
-   * @note  each thread should read its own JobQueue to handle requests
-   *        each thread should calcClientId() to retrieve connections
-   *        managed by himself.
-   *        send and recv are implemented in child class.
-   */
-  virtual void send(int threadId) = 0;
-
-  /**
-   * @brief use multithread to separately receive data
-   *
-   * @note  almost same as send()
-   */
-  virtual void recv(int threadId) = 0;
-
- protected:
-  bool stopping_;
-  /// nodes * ports that means the number of real pservers
-  int serviceNum_;
-  /**
-   * threads num for managing all services. Normally the
-   * number of pservers are relatively less than several
-   * hundreds so that using thread-based parallelization
-   * can benifit traffic performance and pserver's sgd
-   * optimization performance.
-   */
-  int threadNum_;
-  /// the connection manager at client end
-  std::vector<ProtoClient> clients_;
-  /// send threads for parallelization
-  std::vector<ThreadPtr> sendThreads_;
-  /// recv threads for parallelization
-  std::vector<ThreadPtr> recvThreads_;
-  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
-
-  // TODO(yanfei):
-  // current pserver's will return value until all parameters'
-  // optimization are finished so that recv are not overlapped
-  // in reality. More robust implimentation should be to pipeline
-  // all send/recv action based on parameter unit level, and
-  // it will benifits deep and larger model training in future,
-  // especially local node compution power surpasses inter-connection
-  // such as GPU cluster, even with BOX GPU cluster.
-  // queue for buffering send request
-  /**
-   * send/recv queue cooperates with each other to accomplish
-   * overlapping communication with forwardBackward action.
-   */
-  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
-  /// queue for buffering recv request
-  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
-  /// specific for dserver
-  SendJob sendJob_;
-  /// port num for each node
-  int numPorts_;
-  /// if set, overlapped optimization is disabled
-  bool separateSendAndRecv_;
-  std::vector<CpuMemHandlePtr> recvDataMems_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/CMakeLists.txt b/paddle/legacy/pserver/CMakeLists.txt
deleted file mode 100644
index 0ae9c6ef6..000000000
--- a/paddle/legacy/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-# parameter server package
-
-######################### paddle_network ####################
-set(NETWORK_SOURCES
-    LightNetwork.cpp
-    SocketChannel.cpp
-    ProtoServer.cpp)
-
-set(NETWORK_HEADERS
-    LightNetwork.h
-    SocketChannel.h
-    ProtoServer.h)
-
-add_library(paddle_network STATIC
-    ${NETWORK_SOURCES})
-
-add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
-
-################### paddle_pserver ######################
-set(PSERVER_SOURCES
-    BaseClient.cpp
-    ParameterClient2.cpp
-    ParameterServer2.cpp
-    SparseParameterDistribution.cpp
-    ParameterServerController.cpp)
-
-set(PSERVER_HEADERS
-    BaseClient.h
-    ParameterClient2.h
-    ParameterServer2.h
-    SparseParameterDistribution.h
-    ParameterServerController.h)
-
-add_library(paddle_pserver STATIC
-    ${PSERVER_SOURCES})
-
-add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
-
-set(PSERVER_MAIN_SOURCES
-    ParameterServer2Main.cpp)
-
-if(WITH_TESTING)
-  add_subdirectory(test)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
-  link_paddle_exe(paddle_pserver_main)
-
-  install(TARGETS paddle_pserver_main
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
diff --git a/paddle/legacy/pserver/LightNetwork.cpp b/paddle/legacy/pserver/LightNetwork.cpp
deleted file mode 100644
index 469c95853..000000000
--- a/paddle/legacy/pserver/LightNetwork.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <chrono>
-
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <sys/ioctl.h>
-#include <sstream>
-
-#include "LightNetwork.h"
-#include "RDMANetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-/// quick ack can reduce the latency of small message
-DEFINE_bool(small_messages,
-            false,
-            "if message size is small, recommend set it True to enable quick "
-            "ack and no delay");
-
-/// reasonable sock_send_buf_size can control the traffic injected into switch
-/// network. Injecting too many data into traffic could cause packets loss which
-/// cause long latency and degrade the efficiency of communication.
-DEFINE_int32(sock_send_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock send buff size, can reduce network congestion if "
-             "set carefully");
-
-/// reasonable size can hold bursted packets and reduce packets loss
-DEFINE_int32(sock_recv_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock recv buff size");
-
-/// reasonable sock_listen_queue_size can control maximum pending connections.
-DEFINE_int32(sock_listen_queue_size,
-             1024,
-             "listen queue size when pserver listen a TCP port");
-
-namespace paddle {
-
-/**
- * @brief get ip address from interface name
- *
- * @param[in] device device interface name
- */
-std::string getIpAddr(std::string &device) {
-  int sock;
-  struct sockaddr_in sin;
-  struct ifreq ifr;
-
-  sock = socket(AF_INET, SOCK_DGRAM, 0);
-  CHECK(sock >= 0) << "Create socket error.";
-
-  strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ);
-  ifr.ifr_name[IFNAMSIZ - 1] = 0;
-
-  CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0);
-  memcpy(&sin, &ifr.ifr_addr, sizeof(sin));
-  close(sock);
-  return std::string(inet_ntoa(sin.sin_addr));
-}
-
-/**
- * @brief set sock option
- *
- * @param[in] sockfd sock file descriptor
- *
- * @note adjust some default sock option for better performance
- */
-void setOption(int sockfd) {
-#if !defined(__APPLE__) && !defined(__OSX__)
-  int sendSize = FLAGS_sock_send_buf_size;
-  int recvSize = FLAGS_sock_recv_buf_size;
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)),
-      0);
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
-      0);
-#endif
-
-  if (FLAGS_small_messages) {
-    int optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
-        0);
-#ifdef TCP_QUICKACK
-    optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
-        0);
-#endif
-  }
-  int reuse = 1;
-  CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
-           0);
-}
-
-/**
- * @brief class constructor for SocketServer
- * @param[in] addr sock bind address
- * @param[in] port sock bind port
- * @param[in] rdmaCpu rdma sock bind cpu core
- *
- * @note start one socket server which hosts parameter server process.
- *       rdmaCpu is passed to rdma deamon for better performance, and
- *       start tcp socket instead of rdma socket if rdmaCpu is equal
- *       to -1. Each trainer process starts one connection to one socket
- *       server, and use --ports_num to build more connections to harness
- *       fat communication channel if necessary.
- *       each connection is controlled by single thread with blocking
- *       read and write.
- */
-SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
-    : port_(port), addr_(addr), stopping_(false) {
-  if (rdmaCpu == -1) {
-    tcpRdma_ = F_TCP;
-    socket_ = 0;
-    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
-  } else {
-    tcpRdma_ = F_RDMA;
-    rdmaCpu_ = rdmaCpu;
-    rdmaSocket_ = 0;
-
-    std::stringstream ss;
-    ss << port;
-    rdmaUri_ = "rdma://" + addr + ":" + ss.str();
-  }
-
-  /// trigger to initialize RDMA lib
-  CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
-}
-
-SocketServer::~SocketServer() {
-  stopping_ = true;
-  /// trigger accept thread to stop
-  {
-    SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_);
-  }
-  this->join();
-}
-
-/**
- * @brief start one tcp server which hosts parameter server
- *
- * @note do tcp socket bind and listen. it will spawn one thread
- *       for each connection
- */
-void SocketServer::tcpServer() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-  struct hostent *server;
-
-  /// First call to socket() function
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /// Initialize socket structure
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_port = htons(port_);
-  if (!addr_.empty()) {
-    server = gethostbyname(addr_.c_str());
-    CHECK(server) << "ERROR, no such host: " << addr_;
-    bcopy((char *)server->h_addr,
-          (char *)&serv_addr.sin_addr.s_addr,
-          server->h_length);
-  } else {
-    serv_addr.sin_addr.s_addr = INADDR_ANY;
-  }
-
-  setOption(socket_);
-
-  /// Now bind the host address using bind() call.
-  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding " << addr_;
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-    CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsockfd, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  close(socket_);
-  LOG(INFO) << "pserver accept thread finish, addr=" << addr_
-            << " port=" << port_;
-}
-
-/**
- * @brief start one rdma server which hosts parameter server
- *
- * @note do rdma bind and listen, which calling self-defined socket
- *       like rdma library. it will spawn one thread for each connection
- */
-void SocketServer::rdmaServer() {
-  struct sxi_sock *newsock;
-
-  /// First call to socket() function
-  rdmaSocket_ = rdma::ssocket(rdmaCpu_);
-  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
-
-  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
-      << "ERROR bind RDMA socket";
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsock = rdma::accept(rdmaSocket_);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsock) << "ERROR on accept";
-
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-
-    struct sockaddr_in *saddr = rdma::getSourceAddress(newsock);
-    CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsock, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  rdma::close(rdmaSocket_);
-  LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_;
-}
-
-/**
- * @brief start a socket server
- *
- * @note framework for starting socket server
- */
-void SocketServer::run() {
-  if (tcpRdma_ == F_TCP) {
-    LOG(INFO) << "tcp server start ";
-    tcpServer();
-  } else if (tcpRdma_ == F_RDMA) {
-    LOG(INFO) << "rdma server start ";
-    rdmaServer();
-  }
-}
-
-/**
- * @brief class constructor for rdma client deamons
- *
- * @note  automatically start several client deamons for better performance
- */
-std::unique_ptr<RdmaClientDaemons> RdmaClientDaemons::daemons_ = nullptr;
-std::once_flag RdmaClientDaemons::initDataFlag_;
-
-RdmaClientDaemons::RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    rdma::init();
-
-    struct sxi_socket *socket;
-    onlineCpus_ = rdma::numCpus();
-    for (auto i = 0; i < onlineCpus_; i++) {
-      socket = rdma::csocket(i);
-      CHECK(socket) << "ERROR open client socket daemon";
-
-      rdmaClientSocket_.push_back(socket);
-    }
-    LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_;
-    /// round robin scheduler for new connection
-    curCpu_ = 0;
-    /// wait daemons to start completely.
-    sleep(2);
-  }
-}
-
-RdmaClientDaemons::~RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    for (auto i = 0; i < onlineCpus_; i++) {
-      rdma::close(rdmaClientSocket_[i]);
-    }
-    LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ "
-              << onlineCpus_;
-  }
-}
-
-/**
- * @brief worker thread main context
- *
- * @note  each connection from client(trainer) is controlled by single worker
- *        thread, which is for handling all parameter server requests
- */
-void SocketWorker::run() {
-  LOG(INFO) << "worker started, peer = " << channel_->getPeerName();
-
-  std::vector<iovec> inputIovs;
-
-  while (true) {
-    std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
-    if (!msgReader) {
-      break;
-    }
-
-    auto callback = [this](const std::vector<iovec> &outputIovs) {
-      channel_->writeMessage(outputIovs);
-    };
-
-    server_->handleRequest(std::move(msgReader), callback);
-  }
-
-  LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName();
-  delete this;
-}
-
-/**
- * @brief start one tcp connection to tcp server
- * @param[in] serverAddr  tcp server ip
- * @param[in] serverPort  tcp server port
- *
- * @note each object contains one channel which accept byte stream
- */
-void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent *server;
-
-  int errRet;  // temp for gethostbyname_r
-
-  /// Create a socket point
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-
-#if defined(__OSX__) || defined(__APPLE__)
-  server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
-  CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr
-                                   << " ret = " << errRet;
-  CHECK(server) << "getipnodebyname error!";
-#else
-  struct hostent hostinfo;
-  char buf[1024];  // temp for gethostbyname_r
-  CHECK_EQ(
-      0,
-      gethostbyname_r(
-          serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet))
-      << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
-  CHECK(server) << "gethostbyname_r error!";
-#endif
-
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char *)server->h_addr,
-        (char *)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  setOption(sockfd);
-
-  /// Now connect to the server
-  int retry_count = 0;
-  do {
-    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
-      break;
-    }
-
-    if (errno == ECONNREFUSED) {
-      LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_count++ >= 7) {
-        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
-      }
-      std::this_thread::sleep_for(std::chrono::seconds(1));
-    } else {
-      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
-                        << serverPort << "errorno: " << errno;
-    }
-  } while (errno == ECONNREFUSED);
-
-  channel_.reset(new SocketChannel(sockfd, serverAddr));
-  tcpRdma_ = F_TCP;
-}
-
-/**
- * @brief start one RDMA connection to rdma server
- * @param[in] serverAddr  rdma server ip
- * @param[in] serverPort  rdma server port
- *
- * @note  each object contains one channel which accept byte stream
- *        for rdma, low level sock also provide byte stream api.
- */
-void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
-  struct sxi_sock *sock;
-
-  std::stringstream ss;
-  ss << serverPort;
-
-  std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str();
-
-  RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get();
-  socketDaemon_ = daemons->selectDaemon();
-
-  /// connect to server with socket daemon
-  sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
-  CHECK(sock) << "ERROR connect to server" << rdmaUri;
-
-  std::vector<std::string> seg;
-  str::split(rdmaUri, '/', &seg);
-  std::string server = seg.at(seg.size() - 1);
-  channel_.reset(new SocketChannel(sock, server));
-  tcpRdma_ = F_RDMA;
-}
-
-/**
- * @brief class constructor
- * @param[in] serverAddr pserver ip address
- * @param[in] serverPort pserver port
- * @param[in] ChannelType F_TCP or F_RDMA
- *
- * @note  responsible for building one connection to specified pserver port
- */
-SocketClient::SocketClient(const std::string &serverAddr,
-                           int serverPort,
-                           enum ChannelType channelType) {
-  if (channelType == F_RDMA)
-    RdmaClient(serverAddr, serverPort);
-  else
-    TcpClient(serverAddr, serverPort);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/LightNetwork.h b/paddle/legacy/pserver/LightNetwork.h
deleted file mode 100644
index 380f86832..000000000
--- a/paddle/legacy/pserver/LightNetwork.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SocketChannel.h"
-
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <vector>
-
-#include "paddle/legacy/utils/Thread.h"
-
-struct sxi_socket;
-
-namespace paddle {
-
-class SocketWorker;
-
-/**
- * @brief class for holding all parameters processing for current port
- *
- * @note  each parameter server inherits from one socket server, each
- *        server contains serveral woker threads which are to parallelize
- *        the processing of computation, but share some common datas stored
- *        in child class of socketserver.
- */
-class SocketServer : public Thread {
-  // rdmaCpu controls the cpu affinity of RDMA server daemon,
-  // which could benifit performance. rdmaCpu = -1 means TCP
-  // is used instead of RDMA transport.
- public:
-  SocketServer(const std::string& addr, int port, int rdmaCpu);
-  ~SocketServer();
-
-  virtual void run();
-
-  typedef std::function<void(const std::vector<iovec>& outputIovs)>
-      ResponseCallback;
-
- protected:
-  //
-  // The derived class needs to implement this function
-  // to handle the request received by SocketWorker
-  // The request is encapsulated by MsgReader, which contains
-  // a set of blocks.
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback) = 0;
-
-  std::unique_ptr<SocketChannel> createChannel(int sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-  std::unique_ptr<SocketChannel> createChannel(struct sxi_sock* sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-
-  friend class SocketWorker;
-
- private:
-  void rdmaServer();
-  void tcpServer();
-
-  void detach() {}  // detach accept thread is forbidden
-
- protected:
-  enum ChannelType tcpRdma_;
-  // for rdma
-  int rdmaCpu_;
-  std::string rdmaUri_;
-  sxi_socket* rdmaSocket_;
-  // for tcp
-  int port_;
-  std::string addr_;
-  int socket_;
-  int maxPendingConnections_;
-  bool stopping_;
-};
-
-/**
- * @brief class for holding one connection from one trainer
- *
- * @note  all parameter processing will run in the context of this worker
- */
-class SocketWorker : public Thread {
- public:
-  SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
-      : channel_(std::move(channel)), server_(server) {}
-
-  virtual ~SocketWorker() {}
-
-  virtual void run();
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-  SocketServer* server_;
-  enum ChannelType tcpRdma_;
-};
-
-/**
- * @brief class for providing rdma client deamon thread
- *
- * @note  the deamons are required by sock like rdam library. Here
- *        use singleton model for daemons. Each deamon hosts in
- *        single cpu core for better load balance performance
- */
-class RdmaClientDaemons {
- private:
-  RdmaClientDaemons();
-
-  static std::unique_ptr<RdmaClientDaemons> daemons_;
-
- public:
-  static RdmaClientDaemons* get() {
-    std::call_once(RdmaClientDaemons::initDataFlag_,
-                   &RdmaClientDaemons::getInstance);
-
-    return daemons_.get();
-  }
-
-  struct sxi_socket* selectDaemon() {
-    int cpu = curCpu_;
-    curCpu_ = (curCpu_ + 1) % onlineCpus_;
-
-    LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_;
-    return rdmaClientSocket_[cpu];
-  }
-
-  ~RdmaClientDaemons();
-
- public:
-  friend class SocketClient;
-
- private:
-  static std::once_flag initDataFlag_;
-  static void getInstance() {
-    if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
-  }
-
-  std::vector<struct sxi_socket*> rdmaClientSocket_;
-  std::atomic<int> curCpu_;
-  int onlineCpus_;
-};
-
-/**
- * @brief management for client connection which are from trainers
- *
- * @note  it contains one channel descriptor which used to write and
- *        read data
- */
-class SocketClient {
- public:
-  SocketClient(const std::string& serverAddr,
-               int serverPort,
-               enum ChannelType channelType);
-
-  SocketChannel* getChannel() { return channel_.get(); }
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-  struct sxi_socket* socketDaemon_;
-  enum ChannelType tcpRdma_;
-
- private:
-  void RdmaClient(const std::string& serverAddr, int serverPort);
-  void TcpClient(const std::string& serverAddr, int serverPort);
-};
-
-std::string getIpAddr(std::string& device);
-void setOption(int sockfd);
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.cpp b/paddle/legacy/pserver/ParameterClient2.cpp
deleted file mode 100644
index 264faa791..000000000
--- a/paddle/legacy/pserver/ParameterClient2.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "ParameterClient2.h"
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
-
-namespace paddle {
-
-template <typename T1, typename T2>
-void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
-                         const T2* src,
-                         size_t size) {
-  dest->Clear();
-  dest->Reserve(size);
-  for (size_t i = 0; i < size; ++i) {
-    dest->AddAlreadyReserved(src[i]);
-  }
-}
-
-ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
-    : BaseClient(separate, numPorts), port_(port) {
-#ifndef PADDLE_DISABLE_TIMER
-  forwardbackwordTime_ = 0;
-#endif
-}
-
-int ParameterClient2::calcParameterBlockSize(
-    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
-  size_t totalSize = 0;
-  for (auto& para : parameters) {
-    totalSize += para->getSize();
-  }
-  size_t perServerSize = totalSize / serviceNum;
-
-  int sizeBits = 64 - __builtin_clzl(perServerSize);
-
-  /// 2^10 is min block size
-  /// 2^7 will be max number of blocks in one pserver
-  int blockSizeBits = std::max((sizeBits - 7), 10);
-  return 1 << blockSizeBits;
-}
-
-void ParameterClient2::initThreads() {
-  threadNum_ = serviceNum_;
-  if (FLAGS_parallel_thread_num > 1) {
-    LOG(INFO) << "parallel_thread_num dosent need to set";
-  }
-  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-  startThreads();
-}
-
-bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
-  destroy();
-
-  std::vector<std::string> hosts;
-  str::split(FLAGS_pservers, ',', &hosts);
-  serviceNum_ = hosts.size() * numPorts_;
-  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
-
-  /// setup prefetch matrix if exists
-  for (auto& para : parameters) {
-    /// set block size for each parameter
-    para->getConfig().set_parameter_block_size(
-        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
-                                                 : denseBlockSize);
-  }
-
-  for (auto& para : parameters) {
-    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
-    parameterMap_[para->getID()] = para;
-  }
-
-  allSegments_.reserve(parameters.size());
-
-  for (auto& para : parameters) {
-    ParameterSegments segments;
-    segments.name = para->getName();
-    segments.id = para->getID();
-    allSegments_.push_back(segments);
-    if (para->getConfig().sparse_remote_update()) {
-      CHECK_EQ(para->getConfig().parameter_block_size(),
-               para->getConfig().dims(1))
-          << "For sparse remote update parameter,"
-          << " block size is the width of each row.";
-    }
-  }
-
-  /// init clients
-  clients_.reserve(serviceNum_);
-  recvDataMems_.resize(serviceNum_);
-
-  for (size_t i = 0; i < hosts.size(); ++i) {
-    for (int j = 0; j < numPorts_; ++j) {
-      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
-                << port_ + j;
-      if (FLAGS_rdma_tcp == "rdma") {
-        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
-      } else {
-        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
-      }
-    }
-  }
-
-  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
-
-  sleep(2);
-
-  initThreads();
-
-  return true;
-}
-
-ParameterClient2::~ParameterClient2() { destroy(); }
-
-void ParameterClient2::destroy() {
-  if (clients_.empty()) {
-    /// this means not initialized.
-    return;
-  }
-  finishThreads();
-
-  parameterMap_.clear();
-  allSegments_.clear();
-  clients_.clear();
-}
-
-void ParameterClient2::sendParallel(int tid,
-                                    size_t numThreads,
-                                    ParameterType recvParameterType) {
-  int numMyClients = divup(serviceNum_ - tid, numThreads);
-
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_send");
-    int i = numThreads * j + tid;
-    /// Try to make different clients to send data to different pservers
-    /// at the same time so that they will not flood data to the same
-    /// pserver.
-    i = calcClientId(i, serviceNum_);
-    clients_[i].send("sendParameter",
-                     sendJob_.parallelRequests[i],
-                     sendJob_.parallelInputIovs[i]);
-
-    /// clear large structure
-    sendJob_.parallelRequests[i].Clear();
-    sendJob_.parallelInputIovs[i].clear();
-  }
-
-  std::vector<void*> bufs;
-  SendParameterResponse response;
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_recv");
-    int i = numThreads * j + tid;
-    i = calcClientId(i, serviceNum_);
-    auto msgReader = clients_[i].recv(&response);
-    CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-    bufs.clear();
-    bufs.reserve(response.blocks_size());
-    for (auto& block : response.blocks()) {
-      auto it = parameterMap_.find(block.para_id());
-      CHECK(it != parameterMap_.end());
-      Parameter* parameter = it->second.get();
-      real* buf = nullptr;
-      if (parameter->getBuf(recvParameterType)) {
-        buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos());
-      } else {
-        auto recvMat = dynamic_cast<SparseRowCpuMatrix*>(
-            parameter->getMat(recvParameterType).get());
-        CHECK(recvMat);
-        size_t width = parameter->getConfig().dims(1);
-        // TODO(wuyi): need add lock here? may also cause resize.
-        buf = recvMat->getLocalRow(block.begin_pos() / width);
-      }
-      /// sparse_id is not useful while receiving data since sparse data
-      /// storage is continuous, do commit recieved data as that of dense.
-      bufs.push_back(buf);
-    }
-    msgReader->readBlocks(bufs);
-  }
-}
-
-void ParameterClient2::prepareSendData(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    BatchStatus batchStatus,
-    SendJob* sendJob) {
-  sendJob->parallelRequests.resize(serviceNum_);
-  sendJob->parallelInputIovs.resize(serviceNum_);
-
-  for (auto& request : sendJob->parallelRequests) {
-#ifndef PADDLE_DISABLE_TIMER
-    if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) {
-      request.set_forwardbackward_time(forwardbackwordTime_);
-    }
-#endif
-    request.set_trainer_id(trainerId_);
-    request.set_update_mode(updateMode);
-    request.set_send_back_parameter(sendBackParameter);
-    request.set_send_back_parameter_type(sendBackParameterType);
-    request.set_num_samples(numSamples);
-    request.set_cost(cost);
-    request.set_batch_status(batchStatus);
-    CHECK_EQ(request.blocks_size(), 0);
-    VLOG(1) << "request: trainer_id: " << request.trainer_id() << " update_mode"
-            << request.update_mode()
-            << " send_back_parameter: " << request.send_back_parameter()
-            << " send_back_parameter_type: "
-            << request.send_back_parameter_type()
-            << " num_samples: " << request.num_samples()
-            << " cost: " << request.cost()
-            << " batch_status: " << request.batch_status();
-  }
-  for (const auto& segments : parameterSegments) {
-    const auto it = parameterMap_.find(segments.id);
-    CHECK(it != parameterMap_.end());
-    Parameter* parameter = it->second.get();
-    CHECK(parameter != nullptr) << "parameter is nullptr";
-    int64_t nameHash = std::hash<std::string>()(segments.name);
-    bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE ||
-                         updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    bool sparseUpdate = parameter->getConfig().sparse_remote_update() &&
-                        (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT ||
-                         updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE);
-
-    const auto blockSize = parameter->getConfig().parameter_block_size();
-    CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize;
-    const auto paraSize = parameter->getSize();
-    if (sparseUpdate) {
-      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
-          parameter->getMat(PARAMETER_VALUE));
-      CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
-      auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
-          parameter->getMat(parameterType).get());
-      CHECK(sendMat != nullptr) << "sendMat is nullptr";
-
-      syncThreadPool_->exec([&](int tid, size_t numThreads) {
-        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
-        const auto& localIndices = prefetchMat->getLocalIndices();
-        /// num of sparse rows
-        size_t nLocalBlocks = localIndices.size();
-        uint64_t beginDim = 0;
-        uint64_t endDim = 0;
-
-        // HACK(typhoonzero): let it resize first
-        prefetchMat->getLocalRow(nLocalBlocks);
-        sendMat->getLocalRow(nLocalBlocks);
-
-        for (size_t row = 0; row < nLocalBlocks; ++row) {
-          int64_t blockId = localIndices[row];  // local row -> sparse row
-          int serverId = std::abs((blockId + nameHash) % serviceNum_);
-          if (serverId % numThreads != (size_t)tid) {
-            continue;
-          }
-
-          beginDim = blockId * blockSize;
-          endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-
-          auto& request = sendJob->parallelRequests[serverId];
-          ParameterBlock* block = request.add_blocks();
-          block->set_para_id(segments.id);
-          /// global sparse row id
-          block->set_block_id(blockId);
-          /// local row offset
-          block->set_begin_pos(row * blockSize);
-          /// block len
-          block->set_block_size(endDim - beginDim);
-          if (sendingPara) {
-            sendJob->parallelInputIovs[serverId].push_back(
-                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
-            /// detect sparse parameter distribution
-            sparseDistribution_->probeDistribution(serverId,
-                                                   sizeof(real) * blockSize);
-          }
-        }
-      });
-
-    } else {  /// parameter set for dense and sparse
-      real* buf =
-          sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr;
-      uint64_t endDim = 0;
-      for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) {
-        endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-        int64_t blockId = beginDim / blockSize;
-        int serverId = std::abs((blockId + nameHash) % serviceNum_);
-
-        auto& request = sendJob->parallelRequests[serverId];
-        ParameterBlock* block = request.add_blocks();
-        block->set_para_id(segments.id);
-        block->set_block_id(blockId);
-        block->set_begin_pos(beginDim);
-        block->set_block_size(endDim - beginDim);
-        if (buf) {
-          sendJob->parallelInputIovs[serverId].push_back(
-              {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))});
-        }
-      }
-    }
-  }  // parameterSegments
-
-  sparseDistribution_->checkAndResetDistribution();
-}
-
-void ParameterClient2::sendAndReceiveParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    ParameterType recvParameterType) {
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  sendBackParameterType,
-                  /*batchStatus = */ BATCH_START_AND_FINISH,
-                  &sendJob_);
-
-  syncThreadPool_->exec([&](int tid, size_t numThreads) {
-    this->sendParallel(tid, numThreads, recvParameterType);
-  });
-}
-
-void ParameterClient2::sendParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    BatchStatus batchStatus) {
-  SendJobPtr sendJob = std::make_shared<SendJob>();
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  PARAMETER_VALUE,
-                  batchStatus,
-                  sendJob.get());
-
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(sendJob);
-  }
-}
-
-void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); }
-
-void ParameterClient2::send(int threadId) {
-  int index = threadId;
-  LOG(INFO) << "send thread " << threadId << " started";
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    SendJobPtr recvJob = sendJobQueue_[index]->dequeue();
-    if (stopping_) {
-      recvJobQueue_[index]->enqueue(recvJob);
-      break;
-    }
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_send");
-      int i = threadNum_ * j + index;
-      /// Try to make different clients to send data to different pservers
-      /// at the same time so that they will not flood data to the same
-      /// pserver.
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        clients_[i].send("sendParameter",
-                         recvJob->parallelRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      } else {
-        clients_[i].send("sendData",
-                         recvJob->parallelDataRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      }
-    }
-    recvJobQueue_[index]->enqueue(recvJob);
-  }
-}
-
-void ParameterClient2::recv(int threadId) {
-  LOG(INFO) << "recv thread " << threadId << " started";
-  int index = threadId;
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    std::vector<void*> bufs;
-    SendParameterResponse response;
-    SendDataResponse dataResponse;
-    SendJobPtr recvJob = recvJobQueue_[index]->dequeue();
-    if (stopping_) break;
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_recv");
-      int i = threadNum_ * j + index;
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        auto msgReader = clients_[i].recv(&response);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-        bufs.clear();
-        bufs.reserve(response.blocks_size());
-        for (auto& block : response.blocks()) {
-          auto it = parameterMap_.find(block.para_id());
-          CHECK(it != parameterMap_.end());
-          Parameter* parameter = it->second.get();
-          real* buf =
-              parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos());
-          CHECK_EQ(msgReader->getBlockLength(bufs.size()),
-                   sizeof(real) * (block.block_size()));
-          bufs.push_back(buf);
-        }
-        msgReader->readBlocks(bufs);
-      } else {
-        auto msgReader = clients_[i].recv(&dataResponse);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size());
-        size_t totalLen = msgReader->getTotalLength();
-        if (0 == totalLen) {
-          continue;
-        }
-        auto& recvMem = recvDataMems_[dataResponse.server_id()];
-        CHECK_EQ(dataResponse.blocks_size(), 1)
-            << "Only one block currently support now!";
-        auto& block = dataResponse.blocks(0);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        recvMem = std::make_shared<CpuMemoryHandle>(totalLen);
-        msgReader->readNextBlock(recvMem.get()->getBuf());
-      }
-    }
-    recvSyncBarrier_->wait();
-  }
-}
-
-void ParameterClient2::waitPassStart() {
-  WaitPassStartRequest request;
-  std::vector<WaitPassStartResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitPassFinish() {
-  WaitPassFinishRequest request;
-  std::vector<WaitPassFinishResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  request.set_trainer_id(trainerId_);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::setConfig(const OptimizationConfig& optConfig,
-                                 const std::string& saveDir,
-                                 bool isSparseServer) {
-  SetConfigRequest request;
-  std::vector<SetConfigResponse> responses;
-
-  for (auto& nameAndPara : parameterMap_) {
-    *request.add_param_configs() = nameAndPara.second->getConfig();
-  }
-
-  *request.mutable_opt_config() = optConfig;
-  request.set_save_dir(saveDir);
-  request.set_is_sparse_server(isSparseServer);
-
-  std::vector<SetConfigRequest> requests;
-  requests.resize(clients_.size());
-  for (size_t i = 0; i < requests.size(); ++i) {
-    requests[i].CopyFrom(request);
-    requests[i].set_server_id(i);
-  }
-
-  responses.resize(clients_.size());
-  size_t numClients = clients_.size();
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].send(__func__, requests[i]);
-  }
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].recv(&responses[i]);
-  }
-}
-
-bool ParameterClient2::inStatus(PServerStatus status) {
-  GetStatusRequest request;
-  std::vector<GetStatusResponse> responses;
-
-  bool ok = true;
-  multiCall("getStatus", request, &responses);
-  for (auto& response : responses) {
-    if (response.status() != status) {
-      ok = false;
-    }
-  }
-
-  return ok;
-}
-
-void ParameterClient2::setStatus(PServerStatus status) {
-  SetStatusRequest request;
-  request.set_status(status);
-  std::vector<SetStatusResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitForStatus(PServerStatus status) {
-  while (!inStatus(status)) {
-    sleep(1);
-  }
-}
-
-template <typename Proto>
-static void validateResponses(const std::vector<Proto>& responses) {
-  for (auto& response : responses) {
-    CHECK(response.return_message().empty())
-        << "client" << &response - &responses[0]
-        << " error:" << response.return_message();
-  }
-}
-
-PServerVector ParameterClient2::createVector() {
-  CreateVectorRequest request;
-  std::vector<CreateVectorResponse> responses;
-  int64_t handle = -1;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerVector{handle};
-}
-
-void ParameterClient2::releaseVector(PServerVector handle) {
-  ReleaseVectorRequest request;
-  std::vector<ReleaseVectorResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-PServerMatrix ParameterClient2::createMatrix(int32_t numCols) {
-  CreateMatrixRequest request;
-  std::vector<CreateMatrixResponse> responses;
-  int64_t handle = -1;
-
-  request.set_num_cols(numCols);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerMatrix{handle};
-}
-
-void ParameterClient2::releaseMatrix(PServerMatrix handle) {
-  ReleaseMatrixRequest request;
-  std::vector<ReleaseMatrixResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) {
-  ProtoVector& pvec = *op->add_vectors();
-  size_t dim = vec->getSize();
-  pvec.set_dim(dim);
-  copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize());
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
-  ProtoMatrix& pmat = *op->add_matrices();
-  pmat.set_num_cols(mat->getWidth());
-  pmat.set_num_rows(mat->getHeight());
-  copyToRepeatedField(
-      pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
-}
-
-static inline real addTwo(real a, double b) { return a + b; }
-
-void ParameterClient2::doOperation(PreparedOperations& ops,
-                                   bool waitForGradient,
-                                   bool sendBackGradient,
-                                   bool releasePass) {
-  std::vector<DoOperationResponse> responses;
-  ops.request_.set_wait_for_gradient(waitForGradient);
-  ops.request_.set_send_back_parameter(sendBackGradient);
-  ops.request_.set_release_pass(releasePass);
-  multiCall(__func__, ops.request_, &responses);
-  validateResponses(responses);
-  size_t numPassFinishServers = 0;
-
-  size_t numOps = ops.request_.operations_size();
-  for (auto& response : responses) {
-    numPassFinishServers += response.pass_finish();
-    CHECK_EQ(numOps, (size_t)response.results_size());
-    for (size_t opId = 0; opId < numOps; ++opId) {
-      const OperationResult& result = response.results(opId);
-      std::vector<real*>& resultScalars = ops.localResults_[opId].resultScalars;
-      std::vector<CpuVectorPtr>& resultVectors =
-          ops.localResults_[opId].resultVectors;
-      std::vector<CpuMatrixPtr>& resultMatrices =
-          ops.localResults_[opId].resultMatrices;
-
-      if (&response == &responses[0]) {
-        /// Initialize results to zero
-
-        resultScalars.resize(result.scalars_size());
-        for (auto p : resultScalars) {
-          if (!p) continue;
-          *p = 0;
-        }
-        size_t numVectors = result.vectors_size();
-        resultVectors.resize(numVectors);
-        for (size_t i = 0; i < numVectors; ++i) {
-          if (!resultVectors[i]) continue;
-          resultVectors[i]->resize(result.vectors(i).dim());
-          resultVectors[i]->zeroMem();
-        }
-        size_t numMatrices = result.matrices_size();
-        resultMatrices.resize(numMatrices);
-        for (size_t i = 0; i < numMatrices; ++i) {
-          if (!resultMatrices[i]) continue;
-          resultMatrices[i]->resize(result.matrices(i).num_rows(),
-                                    result.matrices(i).num_cols());
-          resultMatrices[i]->zeroMem();
-        }
-      }
-
-      // aggregate results from each pserver to results
-
-      CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size());
-      for (ssize_t i = 0; i < result.scalars_size(); ++i) {
-        real* rscalar = resultScalars[i];
-        if (!rscalar) continue;
-        *rscalar += result.scalars(i);
-      }
-
-      CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size());
-      for (auto& vec : result.vectors()) {
-        int i = &vec - &result.vectors(0);
-        CpuVectorPtr rvec = resultVectors[i];
-        if (!rvec) continue;
-        CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        std::transform(rvec->getData(),
-                       rvec->getData() + rvec->getSize(),
-                       vec.values().data(),
-                       rvec->getData(),
-                       addTwo);
-      }
-
-      CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
-      for (auto& mat : result.matrices()) {
-        int i = &mat - &result.matrices(0);
-        CpuMatrixPtr rmat = resultMatrices[i];
-        if (!rmat) continue;
-        CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
-        CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-
-        std::transform(rmat->getData(),
-                       rmat->getData() + rmat->getElementCnt(),
-                       mat.values().data(),
-                       rmat->getData(),
-                       addTwo);
-      }
-    }
-  }
-  passFinish_ = numPassFinishServers == clients_.size();
-}
-
-real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) {
-  real result = 0.0;
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_utv, u, v)(&result);
-  doOperation(ops, false, false);
-  return result;
-}
-
-void ParameterClient2::vectorScale(PServerVector u, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au, u, a);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_COPY, src, dst);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMultInto(PServerVector u,
-                                         PServerVector v,
-                                         PServerVector w,
-                                         real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorScaleInto(PServerVector u,
-                                       PServerVector v,
-                                       real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::loadValueVector(const std::string& dirName) {
-  LoadValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<LoadValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void ParameterClient2::saveValueVector(const std::string& dirName) {
-  SaveValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<SaveValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.h b/paddle/legacy/pserver/ParameterClient2.h
deleted file mode 100644
index 9320e19c4..000000000
--- a/paddle/legacy/pserver/ParameterClient2.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/pserver/BaseClient.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-#include "SparseParameterDistribution.h"
-
-DECLARE_int32(parallel_thread_num);
-
-namespace paddle {
-
-struct PServerMatrix {
-  int64_t handle;
-};
-
-struct PServerVector {
-  int64_t handle;
-};
-
-/**
- * @brief A class to help to prepare server-side operations.
- */
-class PreparedOperations {
- protected:
-  class ResultsAdder;
-  struct LocalOperationResult;
-
- public:
-  /**
-   * Offers an easy way to prepare operations that will be performed on
-   * server-side.
-   *
-   * Usage:
-   * @code
-   *   addOperation(optype, arguments...)(results...)
-   * @endcode
-   *
-   * Examples:
-   * 1. set pserver vector to 1:
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   addOperation(PSERVER_OP_RESET, u, (real)1);
-   * @endcode
-   *
-   * 2. Compute inner product of to pserver vectors.
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   PServerVector v = parameterClient.createVector();
-   *   real result;
-   *   addOperation(PSERVER_OP_utv, u, v)(&result)
-   * @endcode
-   *
-   * @param[in] operation The operation that pserver will perform.
-   * @param[in] args Argument list of the operation
-   * @return A ResultsAdder object initialized with the last element of
-   *         localResults_.
-   */
-  template <typename... Args>
-  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
-    Operation* op = request_.add_operations();
-    op->set_operation(operation);
-    localResults_.emplace_back();
-    addOperationHelper(op, args...);
-    return ResultsAdder(&localResults_.back());
-  }
-
- protected:
-  void addOperationHelper(Operation* op) {}
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerVector
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerVector arg) {
-    op->add_pvectors(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerMatrix
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerMatrix arg) {
-    op->add_pmatrices(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a real valued
-   *        scalar as an operand.
-   */
-  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuVectorPtr
-   *        as an operand.
-   * @note The array of CpuVectors that arg points to will be copied to
-   *       op's vectors field.
-   */
-  void addOperationHelper(Operation* op, CpuVectorPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuMatrixPtr
-   *        as an operand.
-   * @note The array of CpuMatrixs that arg points to will be copied to
-   *       op's matrices field.
-   */
-  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation and prepare the operands.
-   *
-   * @tparam Arg An operand of the operation.
-   * @tparam Args A list of rest operands of the operation.
-   * @param op Pointer to an Operation object.
-   */
-  template <typename Arg, typename... Args>
-  void addOperationHelper(Operation* op, Arg arg, Args... args) {
-    addOperationHelper(op, arg);
-    addOperationHelper(op, args...);
-  }
-
-  /**
-   * @brief ResultsAdder offers easy ways to quickly store operation results.
-   */
-  class ResultsAdder {
-   public:
-    explicit ResultsAdder(LocalOperationResult* localResult)
-        : localResult_(localResult) {}
-    template <typename... Args>
-    void operator()(Args... args) {
-      addResult(args...);
-    }
-    void addResult() {}
-    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
-    void AddResult(CpuVectorPtr arg) {
-      localResult_->resultVectors.push_back(arg);
-    }
-    void AddResult(CpuMatrixPtr arg) {
-      localResult_->resultMatrices.push_back(arg);
-    }
-    template <typename Arg, typename... Args>
-    void addResult(Arg arg, Args... args) {
-      addResult(arg);
-      addResult(args...);
-    }
-
-   protected:
-    LocalOperationResult* localResult_;
-  };
-
- protected:
-  DoOperationRequest request_;
-  std::vector<iovec> inputIovs_;
-  struct LocalOperationResult {
-    std::vector<real*> resultScalars;
-    std::vector<CpuVectorPtr> resultVectors;
-    std::vector<CpuMatrixPtr> resultMatrices;
-  };
-  std::vector<LocalOperationResult> localResults_;
-  friend class ParameterClient2;
-};
-
-struct ParameterSegments {
-  std::string name;  // name of the parameter
-  size_t id;         // id of the parameter
-};
-
-/**
- * The client interface for parameter server. ParameterClient2 supports 2 modes
- * for managing connections to parameter servers, in the 1st mode one connection
- * is shared by 2 threads that are separately responsible for sending and
- * recieving activities, in the 2nd mode one connection is owned by only one
- * thread, and all the sending and recieving activities run in that single
- * thread.
- * TODO(yanfei):
- * Additional core idea to further optimizate pserver performance is
- * to do sync-sgd based parameter level instead of pserver level.
- * full-parallelization based parameter level for sync-sgd also can
- * sense forwardbackward computation layer-by-layer for more deeper layer
- * model.
- * Firstly, pserver can do full-parallelization on all computation based
- * parameter level instead of waiting for all gradients are finished and
- * start to send back parameters value immediately if parameter is ready
- * instead of waiting for all parameters value are ready
- * Secondly, parameter client can write back parameters to GPU instead of
- * waiting until all parameters are received to CPU host end.
- */
-class ParameterClient2 : public BaseClient {
- public:
-  /** Constructor.
-   * @param separate True if sending and recieving activities are separated
-   *                 into 2 threads, otherwise false.
-   * @param port Port number that parameter client runs on.
-   * @param numPorts Number of ports parameter clients occupies,
-   *                 numPorts * pserver number is the total number of
-   *                 connections the parameter client maintains.
-   */
-  ParameterClient2(bool separate = false,
-                   int port = FLAGS_port,
-                   int numPorts = FLAGS_ports_num);
-
-  ~ParameterClient2();
-
-  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
-                                    size_t serviceNum);
-
- public:
-  bool init(const std::vector<ParameterPtr>& parameters);
-
-  /// service functions
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers, then receives
-   *        the response from the servers.
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] sendBackParameterType Send back parameter type on pserver,
-   *            PARAMETER_VALUE by default
-   * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to
-   *            client[recvParameterType]
-   * @note Only parameterType will be sent.
-   */
-  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
-                               ParameterType parameterType,
-                               const std::vector<ParameterSegments>& segments,
-                               int64_t numSamples,
-                               real cost,
-                               bool sendBackParameter,
-                               ParameterType sendBackParameterType,
-                               ParameterType recvParameterType);
-
-  /**
-   * @brief Sends all parameters to parameter servers, and receives the response
-   *        from the servers.
-   */
-  void sendAndReceiveParameter(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType = PARAMETER_VALUE,
-      ParameterType recvParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(updateMode,
-                            parameterType,
-                            allSegments_,
-                            numSamples,
-                            cost,
-                            sendBackParameter,
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers. Each
-   *        sendParameter() must be paired with a recvParameter() in the future.
-   *        Only parameterType will be sent.
-   *
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] batchStatus Status of the batch.
-   * @note This function is non-blocking. This means that parameter should
-   *       not change between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     const std::vector<ParameterSegments>& segments,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus);
-
-  void recvParameter();
-
-  /**
-   * Sends all parameters to parameter servers, recvParameter() have to be
-   * invoked
-   * afterwards.
-   *
-   * @note This function is non-blocking. This means that if parameter should
-   *       not changes between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus) {
-    sendParameter(updateMode,
-                  parameterType,
-                  allSegments_,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  batchStatus);
-  }
-
-  /// Get all parameters from parameter servers
-  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
-                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Get parameters by sparse row ids from parameter servers
-  void getParameterSparse(
-      ParameterType recvParameterType = PARAMETER_VALUE,
-      ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Set all parameters on parameter servers using the local parameters
-  void setParameter() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-  /**
-   * Set all parameters on parameter servers, values will be zero
-   * means do not sending local parameters
-   */
-  void setParameterZero() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-
-  /**
-   * @brief Wait until all gradient servers start one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd"
-   *       algorithm. Calling this function means that the calling gradient
-   *       server is ready to start a new pass.
-   */
-  void waitPassStart();
-
-  /**
-   * @brief Wait until all gradient servers finish one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd" algorithm.
-   *       Calling this function means that the calling gradient server
-   *       finishes one pass.
-   */
-  void waitPassFinish();
-
-  /// Wait until all gradient servers call this function.
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /// Called when async-sgd finish pass.
-  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
-    return synchronize(syncObjectId);
-  }
-
-  /**
-   * @brief Execute the prepared operations on pservers, fetch the results and
-   *        aggregate results from different pservers.
-   * @param[in] ops Prepared operations that will be executed on pservers.
-   * @param[in] waitForGradient If true, wait for gradient to be ready before
-   *            starting the operations.
-   * @param[in] sendBackParameter If true, send back the parameter to clients
-   *            after the operations are finished.
-   * @param[in] If true, and if all clients call waitPassFinish, signal all
-   *            clients finish the pass.
-   */
-  void doOperation(PreparedOperations& ops,
-                   bool waitForGradient,
-                   bool sendBackParameter,
-                   bool releasePass = true);
-
-  /**
-   * Set the configuration of pserver, including parameter config and
-   * optimization config
-   */
-  void setConfig(const OptimizationConfig& optConfig,
-                 const std::string& saveDir = "",
-                 bool isSparseServer = false);
-
-  /// Return true if all pservers are in the given status
-  bool inStatus(PServerStatus status);
-  bool isPassFinish() { return passFinish_; }
-
-  /// Set pserver status
-  void setStatus(PServerStatus status);
-
-  /**
-   * @brief Wait until all pservers are at status
-   * @note This function is not suitable for frequent use,
-   *       because it sleeps 1 second each time when condition is satisfied.
-   */
-  void waitForStatus(PServerStatus status);
-
-  /// Create a column vector. The size is the dimension of parameter.
-  PServerVector createVector();
-
-  /// Release the PServerVector given handle.
-  void releaseVector(PServerVector handle);
-
-  /**
-   * Create a column major matrix. The number of rows is the dimension of
-   * parameter. The number of columns is specifed by numCols.
-   */
-  PServerMatrix createMatrix(int32_t numCols);
-
-  /// Release the PServerMatrix given handle.
-  void releaseMatrix(PServerMatrix handle);
-
-  // Some basic algebra functions
-  /// Calculate the dot product of u and v
-  real vectorDotProduct(PServerVector u, PServerVector v);
-
-  /// Scale u by a
-  void vectorScale(PServerVector u, real a);
-
-  /// Copy from src to dest
-  void vectorCopy(PServerVector src, PServerVector dst);
-
-  /// u += v * a
-  void vectorAddMult(PServerVector u, PServerVector v, real a);
-
-  /// u = v + w * a
-  void vectorAddMultInto(PServerVector u,
-                         PServerVector v,
-                         PServerVector w,
-                         real a);
-  /// u = v * a
-  void vectorScaleInto(PServerVector u, PServerVector v, real a);
-
-  /// Return pserver parameter value.
-  PServerVector getPServerParameterValue() {
-    PServerVector vec;
-    vec.handle = PARAMETER_VALUE;
-    return vec;
-  }
-
-  /// Return pserver parameter gradient.
-  PServerVector getPServerParameterGradient() {
-    PServerVector vec;
-    vec.handle = PARAMETER_GRADIENT;
-    return vec;
-  }
-
-  /**
-   * Tell pservers to load value vector from file.
-   *
-   * @param[in] dirName The directory that contains the value vector file.
-   */
-  void loadValueVector(const std::string& dirName);
-
-  /// Tell pservers to save value vector to file.
-  void saveValueVector(const std::string& dirName);
-
-  void setTrainerId(int trainerId) { trainerId_ = trainerId; }
-
-#ifndef PADDLE_DISABLE_TIMER
-  void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
-#endif
-
- protected:
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
- private:
-  void destroy();
-
-  /**
-   * @brief management function for parallelizing send/recv all connections
-   *        to all pservers. it is called under one SyncThreadPool. it
-   *        supports to use N thread to control M connections. the receiving
-   *        actions can be started until all sending action to all connections
-   *        owned by current thread are finished. Different connections
-   * controlled
-   *        by different threads can transfer data asynchronously.
-   */
-  void sendParallel(int tid,
-                    size_t numThreads,
-                    ParameterType recvParameterType);
-  /// sending thread routine for asynchronously send data
-  void send(int threadId);
-  /// receiving thread routing for asynchronously receive data
-  void recv(int threadId);
-
-  /**
-   * @brief main routine to build data for pserver
-   *
-   * @note  it can prepare different kinds of parameter type data. it can
-   *        be regarded as layer for bridging real parameters data and
-   *        protobuf data for communication.
-   *        TODO(yanfei):
-   *        can abstract additional layer to encode and decode data to/from
-   *        protobuf data.
-   */
-  void prepareSendData(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,  // client send type
-      const std::vector<ParameterSegments>& parameterSegments,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType,  // send back type in pserver
-      BatchStatus batchStatus,
-      SendJob* sendJob);
-
-  /// start necessary threads for threadPool
-  void initThreads();
-
- protected:
-  /// start port number of pserver
-  /// it deduce all ports for dense and sparse with some rules
-  int port_;
-  /// identify the trainer id using this client
-  int trainerId_;
-
-#ifndef PADDLE_DISABLE_TIMER
-  uint64_t forwardbackwordTime_;
-#endif
-  std::mutex sparseAutoGrowthMutex_;
-
-  /// map id to parameter used for decoding protobuf data
-  std::unordered_map<size_t, ParameterPtr> parameterMap_;
-  /// segments for all parameters that needed to sync
-  std::vector<ParameterSegments> allSegments_;
-
-  /// module for sensing sparse parameters distribution on all pservers
-  std::unique_ptr<SparseParameterDistribution> sparseDistribution_;
-
-  /// thread pool for parallelizing all connections to pservers
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  bool passFinish_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.cpp b/paddle/legacy/pserver/ParameterServer2.cpp
deleted file mode 100644
index 8533a322d..000000000
--- a/paddle/legacy/pserver/ParameterServer2.cpp
+++ /dev/null
@@ -1,1401 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServer2.h"
-
-#include <algorithm>
-#include <fstream>
-
-#include "paddle/legacy/math/SIMDFunctions.h"
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include "paddle/legacy/parameter/ParameterUpdateFunctions.h"
-#include "paddle/legacy/parameter/Regularizer.h"
-#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-DEFINE_double(async_lagged_ratio_min,
-              1.0,
-              "control config_.async_lagged_grad_discard_ratio() min value");
-DEFINE_double(
-    async_lagged_ratio_default,
-    1.5,
-    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
-    "use it as defalut value");
-
-namespace paddle {
-
-const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
-    "Invalid matrix handle";
-const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
-    "Invalid vector handle";
-const std::string ParameterServer2::kRetMsgUnknownOperation =
-    "Unknown operation";
-
-ParameterServer2::ParameterServer2(const std::string& addr,
-                                   int port,
-                                   int rdmaCpu)
-    : ProtoServer(addr, port, rdmaCpu),
-      dataSize_(0),
-      size_(0),
-      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      passBarrier_(FLAGS_num_gradient_servers + 1),
-      numPassFinishClients_(0),
-      allClientPassFinish_(false),
-      serverId_(-1),
-      batchId_(-1) {
-  /**
-   * register function for remote client calling, these functions
-   * will be mapped to a data structure for quick looking up. each
-   * request from trainer can contains one function name to indicate
-   * remote action. this architecture looks like rpc style for pserver.
-   */
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
-
-  /// thread pool for parallelizing some computations
-  if (FLAGS_pserver_num_threads > 1) {
-    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
-  }
-}
-
-bool ParameterServer2::init() {
-  vectors_.resize(NUM_PARAMETER_TYPES);
-  configMap_.clear();
-
-  numSamplesProcessed_ = 0;
-  cost_ = 0;
-  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
-  if (mpienv != NULL) {
-    mpiSize_ = atoi(mpienv);
-  } else {
-    mpiSize_ = 1;
-  }
-  status_ = PSERVER_STATUS_NOT_SET;
-  dataMems_.resize(FLAGS_num_gradient_servers);
-  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
-  for (auto& barrier : synchronizeBarriers_) {
-    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
-  }
-
-  // initialization for dicarding lagging gradient
-  asyncUpdateSteps_ = 0;
-  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
-  asyncLaggedGradientsNum_ = 0;
-  asyncUpdateStat_.resize(static_cast<int>(FLAGS_num_gradient_servers *
-                                           FLAGS_async_lagged_ratio_default));
-  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
-  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
-  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
-
-  return true;
-}
-
-void ParameterServer2::getStatus(const GetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  (void)request;
-  GetStatusResponse response;
-  response.set_status(status_);
-  callback(response);
-}
-
-void ParameterServer2::setStatus(const SetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  status_ = request.status();
-  SetStatusResponse response;
-  callback(response);
-}
-
-void ParameterServer2::setConfig(const SetConfigRequest& request,
-                                 ProtoResponseCallback callback) {
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-
-    serverId_ = request.server_id();
-    isSparseServer_ = request.is_sparse_server();
-
-    if (!request.save_dir().empty()) {
-      mkDir(request.save_dir().c_str());
-    }
-
-    for (const auto& config : request.param_configs()) {
-      CHECK(!configMap_.count(config.para_id()))
-          << "Duplicated parameter name: " << config.name();
-      configMap_[config.para_id()] = config;
-      CHECK_EQ(config.sparse_remote_update(), isSparseServer_);
-    }
-
-    config_ = request.opt_config();
-    if (config_.algorithm() == TrainAlgorithm::AsyncSGD) {
-      auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio();
-      if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) {
-        LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small"
-                  << "reset to default, async_lagged_grad_discard_ratio = "
-                  << FLAGS_async_lagged_ratio_default;
-        asyncLaggedRatio = FLAGS_async_lagged_ratio_default;
-      }
-      asyncLaggedThreshold_ =
-          static_cast<int64_t>(FLAGS_num_gradient_servers * asyncLaggedRatio);
-      LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio
-                << " asyncLaggedhreshold: " << asyncLaggedThreshold_;
-    }
-    if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) {
-      /// sparse server must NOT use local update mode
-      config_.set_num_batches_per_send_parameter(1);
-    }
-
-    if (config_.num_batches_per_send_parameter() > 1 &&
-        config_.center_parameter_update_method() == "average") {
-      /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer
-      /// if parameter regularization in pserver
-      for (auto& pair : configMap_) {
-        ParameterConfig& config = pair.second;
-        if (config_.num_batches_per_send_parameter() ==
-            config.num_batches_regularization()) {
-          real scale =
-              config_.delta_add_rate() * config.num_batches_regularization();
-          if (config_.algorithm() == "sgd") {
-            scale *= FLAGS_num_gradient_servers;
-          }
-          config.set_decay_rate(config.decay_rate() * scale);
-          if (config.decay_rate() > 0.1f) {
-            LOG(FATAL) << "L2 decay=" << config.decay_rate()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-          config.set_decay_rate_l1(config.decay_rate_l1() * scale);
-          if (config.decay_rate_l1() > 0.1f) {
-            LOG(FATAL) << "L1 decay=" << config.decay_rate_l1()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-
-          LOG(INFO) << "parameter:" << config.name()
-                    << " decay apply in pserver,"
-                    << " L1 decay=" << config.decay_rate_l1()
-                    << " L2 decay=" << config.decay_rate();
-        }
-      }
-    }
-  }
-
-  SetConfigResponse response;
-  callback(response);
-}
-
-real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
-  real sum = 0;
-  for (const auto buffer : buffers) {
-    for (size_t i = 0; i < buffer.size; ++i) {
-      sum += buffer.base[i];
-    }
-  }
-  return sum;
-}
-
-void ParameterServer2::mergeSegments(BlockSegments* segments) {
-  if (segments->empty()) {
-    return;
-  }
-  std::sort(segments->begin(), segments->end());
-  auto curr = segments->begin();
-  for (auto it = segments->begin(); it != segments->end(); ++it) {
-    if (it->first <= curr->second) {
-      curr->second = std::max(curr->second, it->second);
-    } else {
-      ++curr;
-      *curr = *it;
-    }
-  }
-  ++curr;
-  segments->erase(curr, segments->end());
-}
-
-void ParameterServer2::setParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)response;
-  (void)outputBuffers;
-  LOG(INFO) << "pserver: setParameter";
-  std::lock_guard<RWLock> guard(parameterMutex_);
-
-  int64_t numBlocks = blockIdMap_.size();
-  CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size());
-  /// total bytes for all the added blocks
-  int64_t totalSize = size_;
-  std::vector<int64_t> offsets;
-  offsets.reserve(request.blocks_size());
-  std::vector<int64_t> blockIds;
-  blockIds.reserve(request.blocks_size());
-  int bufferIndex = 0;
-
-  if (!request.blocks().size()) {
-    LOG(WARNING)
-        << "--ports_num or --ports_num_for_sparse might be too large, "
-        << "or total dense parameter size or sparse parameters size "
-        << "might be too small, this psever doesn't store any parameter.";
-    return;
-  }
-
-  for (const auto& block : request.blocks()) {
-    /// block size for parameter(e.g. 128 for sparse row, 1K for dense)
-    uint64_t blockSize = getParameterConfig(block).parameter_block_size();
-    BlockKey key(block.para_id(), block.block_id());
-    if (inputBuffers.size()) {  // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-      CHECK_EQ(buffer.size, block.block_size())
-          << "data size is too big:"
-          << " block_size=" << block.block_size()
-          << " data_size=" << buffer.size;
-    }
-
-    /// add a new block
-    if (blockIdMap_.count(key) == 0) {
-      blockOffsetMap_[key] = totalSize;
-      blockIdMap_[key] = numBlocks;
-      ++numBlocks;
-      totalSize += blockSize;
-    }
-    offsets.push_back(blockOffsetMap_[key]);
-    blockIds.push_back(blockIdMap_[key]);
-  }
-
-  size_ = totalSize;
-  LOG(INFO) << "pserver: new cpuvector: size=" << size_;
-  if (!vectors_[PARAMETER_VALUE]) {
-    /// vectors_
-    const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/);
-    for (const auto type : types) {
-      vectors_[type].reset(new CpuVector(size_));
-      vectors_[type]->zeroMem();
-    }
-
-    blockInfos_.resize(numBlocks);
-    for (auto& info : blockInfos_) {
-      info.lock.reset(new std::mutex());
-    }
-  } else {
-    CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize())
-        << "Currently adding new blocks is not supported. "
-        << "All blocks must be added in one setParameter call";
-  }
-
-  VectorPtr buf = vectors_[PARAMETER_VALUE];
-  usedSegments_.reserve(offsets.size());
-  /// if offsets is empty, means parameter_block_size is too big or too many
-  /// nodes.
-  if (offsets.empty()) {
-    LOG(WARNING) << "in setParameter: offsets is empty";
-  }
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    size_t blockId = blockIds[i];
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(request.blocks(i));
-    info.config = &config;
-    info.offset = offsets[i];
-    info.optimizer.reset(sgdOptimizerCreate(
-        config_, config, config.sparse_remote_update(), true /*inPserver*/));
-    if (config.sparse_remote_update()) {
-      size_t width = config.dims(1);
-      CHECK_EQ(config.parameter_block_size(), width)
-          << "block size: " << config.parameter_block_size()
-          << "width : " << width;
-    }
-    info.optimizer->init(1, info.config);
-    usedSegments_.push_back(std::make_pair(
-        offsets[i], offsets[i] + request.blocks(i).block_size()));
-  }
-  mergeSegments(&usedSegments_);
-
-  if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) {
-    /// copy param from trainer
-    for (size_t i = 0; i < offsets.size(); ++i) {
-      Buffer buffer = inputBuffers[i];
-      real* start = buf->getPoint(offsets[i]);
-      CHECK_LE(offsets[i] + buffer.size, buf->getSize());
-      memcpy(start, buffer.base, sizeof(real) * buffer.size);
-    }
-  } else {
-    CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    /// nothing to do, value vector zero mem already
-  }
-}
-
-void ParameterServer2::addGradient(const SendParameterRequest& request,
-                                   std::vector<Buffer>& inputBuffers,
-                                   SendParameterResponse* response,
-                                   std::vector<Buffer>* outputBuffers) {
-  VLOG(1) << "pserver: addGradient";
-
-  {
-    ReadLockGuard guard(parameterMutex_);
-    int bufferIndex = 0;
-    for (const auto& block : request.blocks()) {
-      int64_t offset = getBlockOffset(block);
-      CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                          << " id=" << block.para_id()
-                          << " block id=" << block.block_id();
-
-      int64_t blockId = getBlockId(block);
-      CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                           << " id=" << block.para_id()
-                           << " block id=" << block.block_id();
-
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-
-      const real* gradientBuffer = buffer.base;
-      real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset);
-
-      size_t size = buffer.size;
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      if (config.sparse_remote_update()) {
-        CHECK_EQ(size, config.parameter_block_size());
-      } else {  // dense
-        CHECK_LE(size, config.parameter_block_size());
-      }
-      std::lock_guard<std::mutex> guard(*info.lock);
-      simd::addTo(gradientSumBuffer, gradientBuffer, size);
-    }
-  }
-  if (request.batch_status() == BATCH_FINISH ||
-      request.batch_status() == BATCH_START_AND_FINISH) {
-    numSamplesProcessed_ += request.num_samples();
-    cost_ += request.cost();
-    VLOG(1) << "num samples: " << numSamplesProcessed_
-            << ", new cost:" << cost_;
-
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-    VLOG(1) << "start send back";
-  }
-}
-
-bool ParameterServer2::asyncGrdientCommitCheckAndStat(
-    const SendParameterRequest& request) {
-  const auto trainerId = request.trainer_id();
-  int64_t trainerSteps = asyncTrainerSteps_[trainerId];
-  CHECK_GE(asyncUpdateSteps_, trainerSteps)
-      << " async update steps overflows "
-      << " trainer id: " << trainerId
-      << " async update steps in pserver: " << asyncUpdateSteps_
-      << " async update steps in request: " << trainerSteps;
-
-  asyncUpdateSteps_++;
-  bool commitGradient = true;
-
-  int64_t delta = asyncUpdateSteps_ - trainerSteps;
-  if (delta >= asyncLaggedThreshold_) {
-    VLOG(1) << "discard Async Update: "
-            << " trainer id: " << trainerId
-            << " pserver steps: " << asyncUpdateSteps_
-            << " request steps: " << trainerSteps;
-    asyncLaggedGradientsNum_++;
-    commitGradient = false;
-  }
-  /// stat on lagged steps, to get total discard distribution
-  if (static_cast<size_t>(delta) < asyncUpdateStat_.size()) {
-    asyncUpdateStat_[delta]++;
-  } else {
-    asyncUpdateStat_[asyncUpdateStat_.size() - 1]++;
-  }
-  /// stat on trainerId and discard, to get trainer condition
-  if (commitGradient) {
-    asyncTrainerCommitStat_[trainerId]++;
-  } else {
-    asyncTrainerDiscardStat_[trainerId]++;
-  }
-
-  return commitGradient;
-}
-
-static ThreadLocal<std::vector<bool>> localBlockBitset_;
-
-void ParameterServer2::asyncSGD(const SendParameterRequest& request,
-                                std::vector<Buffer>& inputBuffers,
-                                SendParameterResponse* response,
-                                std::vector<Buffer>* outputBuffers) {
-  int64_t numBlocks = blockIdMap_.size();
-  auto& localBlockBitset = *localBlockBitset_;
-
-  if (isSparseServer_) {
-    if (localBlockBitset.empty()) {
-      localBlockBitset.resize(numBlocks);
-    }
-    localBlockBitset.assign(numBlocks, false);
-  }
-
-  ReadLockGuard guard(parameterMutex_);
-
-  if (request.send_back_parameter()) {
-    outputBuffers->reserve(request.blocks_size());
-  }
-
-  bool commitGradient = asyncGrdientCommitCheckAndStat(request);
-
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  size_t bufferIndex = 0;
-  for (const auto& block : request.blocks()) {
-    int64_t offset = getBlockOffset(block);
-    CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                        << " id=" << block.para_id()
-                        << " block id=" << block.block_id();
-    int64_t blockId = getBlockId(block);
-    CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                         << " id=" << block.para_id()
-                         << " block id=" << block.block_id();
-    Buffer buffer = inputBuffers[bufferIndex];
-    ++bufferIndex;
-
-    size_t size = buffer.size;
-
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-
-    std::lock_guard<std::mutex> guard(*info.lock);
-    /// gradients are too obsolete, will be discarded
-    if (commitGradient) {
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size);
-      info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1);
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-
-    if (commitGradient && isSparseServer_) {
-      localBlockBitset[blockId] = true;
-    }
-
-    if (!isSparseServer_ && request.send_back_parameter()) {  // dense
-      int type = request.send_back_parameter_type();
-      sendBackParameter(block, type, response, &buffer, outputBuffers);
-    }
-  }  /// foreach block
-
-  asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_;
-
-  if (commitGradient && isSparseServer_) {
-    /// find blocks that trainer do not request update
-    for (int64_t blockId = 0; blockId < numBlocks; ++blockId) {
-      if (localBlockBitset[blockId]) {
-        continue;
-      }
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = *info.config;
-      size_t size = config.parameter_block_size();
-
-      std::lock_guard<std::mutex> guard(*info.lock);
-      info.optimizer->startBatch(numSamplesProcessed_);
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, info.offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-  }
-
-  if (commitGradient && (request.batch_status() == BATCH_FINISH ||
-                         request.batch_status() == BATCH_START_AND_FINISH)) {
-    numSamplesProcessed_ += request.num_samples();
-  }
-
-  /// show some performance log if needed
-  if (request.trainer_id() == 0) {
-    /// batchId_ is approximately equal to "real batchId_"
-    batchId_++;
-  }
-}
-
-void ParameterServer2::getParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  LOG(INFO) << "pserver: getParameter";
-  ReadLockGuard guard(parameterMutex_);
-  for (const auto& block : request.blocks()) {
-    int type = request.send_back_parameter_type();
-    sendBackParameter(block, type, response, outputBuffers);
-  }
-}
-
-void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
-                                          std::vector<Buffer>& inputBuffers,
-                                          SendParameterResponse* response,
-                                          std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  auto& buffer = *readWriteBuffer_;
-  size_t numReals = 0;
-  for (const auto& block : request.blocks()) {
-    numReals += getParameterConfig(block).dims(1);
-  }
-  buffer.resize(numReals);
-
-  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
-
-  ReadLockGuard guard(parameterMutex_);
-  size_t offset = 0;
-  for (const auto& block : request.blocks()) {
-    size_t width = getParameterConfig(block).dims(1);
-    Buffer buf = {buffer.data() + offset, width};
-    int type = request.send_back_parameter_type();
-    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
-    offset += width;
-  }
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  outputBuffers->push_back({valueBuffer, (size_t)block.block_size()});
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         Buffer* buffer,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  size_t size = buffer->size;
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  /// copy to second buffer to avoid to be polluted by other request
-  memcpy(buffer->base, valueBuffer, sizeof(real) * size);
-  outputBuffers->push_back({buffer->base, size});
-}
-
-void ParameterServer2::sendBackParameterSparse(
-    const ParameterBlock& block,
-    int parameterType,
-    SendParameterResponse* response,
-    Buffer* buffer,
-    size_t width,
-    std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  CHECK_EQ(buffer->size, width);
-  memcpy(buffer->base, valueBuffer, width * sizeof(real));
-  outputBuffers->push_back(*buffer);
-}
-
-void ParameterServer2::readAllBlocks(
-    MsgReader* msgReader, std::vector<ParameterServer2::Buffer>* buffers) {
-  auto& buffer = *readWriteBuffer_;
-  size_t numBlocks = msgReader->getNumBlocks();
-  buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real),
-                              numBlocks);
-  std::vector<void*> bufs(numBlocks);
-  buffers->clear();
-  buffers->reserve(numBlocks);
-  buffer.resetAlignAlloc();
-  for (size_t i = 0; i < numBlocks; ++i) {
-    size_t len = msgReader->getBlockLength(i);
-    CHECK_EQ(len % sizeof(real), (size_t)0);
-    size_t size = len / sizeof(real);
-    bufs[i] = buffer.nextBlock(size);
-    buffers->push_back({(real*)bufs[i], size});
-  }
-  msgReader->readBlocks(bufs);
-}
-
-void ParameterServer2::sendParameter(const SendParameterRequest& request,
-                                     std::unique_ptr<MsgReader> msgReader,
-                                     ProtoResponseCallbackEx callback) {
-  SendParameterResponse response;
-  std::vector<Buffer> inputBuffers;
-  std::vector<Buffer> outputBuffers;
-  readAllBlocks(msgReader.get(), &inputBuffers);
-  msgReader.reset();
-
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-      setParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-      getParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-      getParameterSparse(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-      asyncSGD(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      addGradient(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      break;
-  }
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      (*requestVec_).push_back(request);
-      (*callbackVec_).push_back(callback);
-      if (request.batch_status() == BATCH_FINISH ||
-          request.batch_status() == BATCH_START_AND_FINISH) {
-        for (size_t i = 0; i < (*requestVec_).size(); i++) {
-          ReadLockGuard guard(parameterMutex_);
-          SendParameterRequest& request = (*requestVec_)[i];
-          SendParameterResponse responseTemp;
-
-          std::vector<iovec> outputIovs;
-          if (request.send_back_parameter()) {
-            CHECK(!isSparseServer_);
-            std::vector<Buffer> outputBuffersTemp;
-            for (const auto& block : request.blocks()) {
-              int type = request.send_back_parameter_type();
-              sendBackParameter(block, type, &responseTemp, &outputBuffersTemp);
-            }
-            outputIovs.reserve(outputBuffersTemp.size());
-            for (auto buffer : outputBuffersTemp) {
-              outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-            }
-          }
-
-          ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i];
-          callbackTemp(responseTemp, outputIovs);
-        }
-        (*requestVec_).clear();
-        (*callbackVec_).clear();
-      }
-      break;
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      std::vector<iovec> outputIovs;
-      outputIovs.reserve(outputBuffers.size());
-      for (auto buffer : outputBuffers) {
-        outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-      }
-      callback(response, outputIovs);
-      break;
-  }
-}
-
-template <typename Dtype>
-void ParameterServer2::reduceAndSendData(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  auto sendData = reinterpret_cast<Dtype*>(dataMems_[0].get()->getBuf());
-  size_t rawMemSize = dataMems_[0].get()->getSize();
-  CHECK_EQ(rawMemSize % sizeof(Dtype), 0U);
-  size_t dataMemSize = rawMemSize / sizeof(Dtype);
-  for (size_t i = 1; i < dataMems_.size(); ++i) {
-    CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize);
-    auto data = reinterpret_cast<Dtype*>(dataMems_[i].get()->getBuf());
-    for (size_t j = 0; j < dataMemSize; ++j) {
-      sendData[j] += data[j];
-    }
-  }
-  std::vector<iovec> outputIovs;
-  auto block = response.add_blocks();
-  outputIovs.push_back({sendData, rawMemSize});
-  block->set_total_size(rawMemSize);
-  block->set_data_size(sizeof(Dtype));
-  callback(response, outputIovs);
-}
-
-void ParameterServer2::templateReduceSum(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  const auto& block = request.blocks(0);
-  switch (block.data_type()) {
-    case TRANS_FLOAT:
-      reduceAndSendData<float>(request, msgReader, callback);
-      break;
-    case TRANS_DOUBLE:
-      reduceAndSendData<double>(request, msgReader, callback);
-      break;
-    case TRANS_INT32:
-      reduceAndSendData<int>(request, msgReader, callback);
-      break;
-    case TRANS_UINT32_T:
-      reduceAndSendData<uint32_t>(request, msgReader, callback);
-      break;
-    case TRANS_INT64_T:
-      reduceAndSendData<int64_t>(request, msgReader, callback);
-      break;
-    case TRANS_UINT64_T:
-      reduceAndSendData<uint64_t>(request, msgReader, callback);
-      break;
-    default:
-      LOG(FATAL) << "not supported";
-      break;
-  }
-}
-
-void ParameterServer2::sendData(const SendDataRequest& request,
-                                std::unique_ptr<MsgReader> msgReader,
-                                ProtoResponseCallbackEx callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  switch (request.update_mode()) {
-    case DATA_UPDATE_MODE_SET_OWN: {
-      CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size()));
-      size_t totalLen = msgReader->getTotalLength();
-      if (totalLen > 0) {
-        CHECK_EQ(msgReader->getNumBlocks(), 1U)
-            << "Only one block currently support now!";
-        const auto& block = request.blocks(0);
-        if (0 == dataSize_) {
-          dataSize_ = block.data_size();
-        } else {
-          CHECK_EQ(dataSize_, block.data_size());
-        }
-        int64_t serverId = request.server_id();
-        if (serverId_ < 0) {
-          serverId_ = serverId;
-        } else {
-          CHECK_EQ(serverId_, serverId);
-        }
-        int64_t clientId = request.client_id();
-        dataMems_[clientId] = std::make_shared<CpuMemoryHandle>(totalLen);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        msgReader->readNextBlock(dataMems_[clientId].get()->getBuf());
-      }
-      msgReader.reset();
-      std::vector<iovec> outputIovs;
-      callback(response, outputIovs);
-      break;
-    }
-    case DATA_UPDATE_MODE_GET_ALL: {
-      /// Currently only support DATA_REDUCE_SUM
-      /// And their Operations are just add
-      CHECK(DATA_REDUCE_SUM == request.type());
-      templateReduceSum(request, msgReader, callback);
-      break;
-    }
-    default: { LOG(FATAL) << "not supported"; }
-  }
-}
-
-void ParameterServer2::clearUnusedSegments(CpuVector* vec) {
-  real* data = vec->getData();
-  if (usedSegments_.empty()) {
-    return;
-  }
-  memset(data, 0, sizeof(real) * usedSegments_[0].first);
-  memset(data + usedSegments_.back().second,
-         0,
-         sizeof(real) * (size_ - usedSegments_.back().second));
-  size_t n = size_ - usedSegments_.back().second;
-
-  for (size_t i = 1; i < usedSegments_.size(); ++i) {
-    memset(
-        data + usedSegments_[i - 1].second,
-        0,
-        sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second));
-    n += usedSegments_[i].first - usedSegments_[i - 1].second;
-  }
-}
-
-void ParameterServer2::parallelExecForEachBlock(ExecFunc func) {
-  SyncThreadPool::execHelper(
-      syncThreadPool_.get(), [&](int tid, size_t numThreads) {
-        int64_t numBlocks = blockIdMap_.size();
-        VectorPtr* vecs = parameter::getThreadLocalBuffer();
-        for (int64_t blockId = tid; blockId < numBlocks;
-             blockId += numThreads) {
-          func(blockId, vecs);
-        }
-      });
-}
-
-void ParameterServer2::blockTraverse(
-    BlockInfo& info,
-    const ParameterConfig& config,
-    int64_t offset,
-    size_t size,
-    const VectorPtr vecs[],
-    const ParameterOptimizer::TraverseCallback& callback) {
-  /// setup sub bufs
-  for (const auto type : info.optimizer->getParameterTypes()) {
-    vecs[type]->subVecFrom(*vectors_[type], offset, size);
-  }
-  callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-}
-
-void ParameterServer2::op_SGD(const Operation& operation,
-                              OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  if (allClientPassFinish_) {
-    /// when all clients signal pass finished, the update
-    /// is empty.
-    return;
-  }
-
-  {
-    parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      int64_t offset = info.offset;
-      size_t size = config.parameter_block_size();
-
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      info.optimizer->update(
-          vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    });
-  }
-
-  batchId_++;
-}
-
-void ParameterServer2::op_start_pass(const Operation& operation,
-                                     OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    info.optimizer->startPass();
-  });
-}
-
-void ParameterServer2::op_finish_pass(const Operation& operation,
-                                      OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    /// catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, info.offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    /// finish pass
-    info.optimizer->finishPass();
-  });
-  batchId_ = 0;
-}
-
-void ParameterServer2::op_apply(const Operation& operation,
-                                OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    int64_t offset = info.offset;
-    size_t size = config.parameter_block_size();
-
-    // catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    // apply to PARAMETER_APPLY
-    if (auto callback = info.optimizer->apply()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-    }
-  });
-}
-
-void ParameterServer2::op_randomize(const Operation& operation,
-                                    OperationResult* result) {
-  LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_;
-
-  CpuVector& valueVec = *vectors_[PARAMETER_VALUE];
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size);
-    Parameter::randomize(vecs[PARAMETER_VALUE], config);
-  });
-}
-
-void ParameterServer2::loadValueVector(const LoadValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  LoadValueResponse response;
-  LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_;
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ifstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameters in pserver";
-  CHECK(Parameter::isHeaderFormatSupported(header.format))
-      << "Incorrect format version: " << header.format;
-  CHECK_EQ(header.size, (size_t)size_)
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << size_ << ") of the pserver: " << serverId_;
-  CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize "
-                                           << header.valueSize;
-  CHECK(fs.read(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)));
-
-  callback(response);
-}
-
-void ParameterServer2::saveValueVector(const SaveValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  SaveValueResponse response;
-  LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_;
-
-  mkDir(request.dir_name().c_str());
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
-                                             : *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  // TODO(TJ): save param headerFormat_
-  header.format = PARAM_FORMAT_ORIGINAL;
-  header.valueSize = sizeof(real);
-  header.size = size_;
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(fs.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  CHECK(fs.write(reinterpret_cast<char*>(vec.getData()),
-                 header.size * sizeof(real)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  callback(response);
-}
-
-void ParameterServer2::op_RESET(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  CpuVector* u = vectors_[operation.pvectors(0)].get();
-  u->reset(operation.scalars(0));
-  clearUnusedSegments(u);
-}
-
-void ParameterServer2::op_utv(const Operation& operation,
-                              OperationResult* result) {
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum += (double)u[i] * (double)v[i];
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_au_bv(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = a * u[i] + b * v[i];
-  }
-}
-
-void ParameterServer2::op_COPY(const Operation& operation,
-                               OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = u[i];
-  }
-}
-
-void ParameterServer2::op_au(const Operation& operation,
-                             OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    u[i] *= a;
-  }
-}
-
-void ParameterServer2::op_au_bv_cw(const Operation& operation,
-                                   OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  real* w = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  real c = operation.scalars(2);
-  for (int64_t i = 0; i < size; ++i) {
-    w[i] = a * u[i] + b * v[i] + c * w[i];
-  }
-}
-
-void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation,
-                                                 OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] < 0) {
-      dir[i] = -grad[i] + l1weight;
-    } else if (x[i] > 0) {
-      dir[i] = -grad[i] - l1weight;
-    } else {
-      if (grad[i] < -l1weight) {
-        dir[i] = -grad[i] - l1weight;
-      } else if (grad[i] > l1weight) {
-        dir[i] = -grad[i] + l1weight;
-      } else {
-        dir[i] = 0;
-      }
-    }
-  }
-}
-
-void ParameterServer2::op_fix_dir_signs(const Operation& operation,
-                                        OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* steepestDescDir = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] * steepestDescDir[i] <= 0) {
-      dir[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_fix_omega_signs(const Operation& operation,
-                                          OperationResult* result) {
-  (void)result;
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newx = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] * newx[i] < 0) {
-      newx[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_dir_deriv(const Operation& operation,
-                                    OperationResult* result) {
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] != 0) {
-      if (x[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (x[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      } else if (dir[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (dir[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      }
-    }
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_cost(const Operation& operation,
-                               OperationResult* result) {
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newgrad = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  real l2weight = operation.scalars(1);
-  double cost_real = cost_ / mpiSize_;
-  double sum_weight_l1 = 0;
-  double sum_weight_l2 = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum_weight_l1 += std::abs(x[i]);
-    sum_weight_l2 += x[i] * x[i];
-    newgrad[i] += 2.0 * l2weight * x[i];
-  }
-  cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2;
-  result->add_scalars(cost_real);
-}
-
-ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = {
-    nullptr,                         // PSERVER_OP_utu = 0;
-    &ParameterServer2::op_utv,       // PSERVER_OP_utv = 1;
-    &ParameterServer2::op_au,        // PSERVER_OP_au = 2;
-    &ParameterServer2::op_au_bv,     // PSERVER_OP_au_bv = 3;
-    nullptr,                         // PSERVER_OP_aAx_bu = 4;
-    &ParameterServer2::op_SGD,       // PSERVER_OP_SGD = 5;
-    &ParameterServer2::op_RESET,     // PSERVER_OP_RESET = 6;
-    &ParameterServer2::op_COPY,      // PSERVER_OP_COPY = 7;
-    &ParameterServer2::op_au_bv_cw,  // PSERVER_OP_au_bv_cw = 8;
-    &ParameterServer2::op_make_steepest_desc_dir,
-    /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
-    &ParameterServer2::op_fix_dir_signs,    // PSERVER_OP_FIX_SIGNS = 10;
-    &ParameterServer2::op_dir_deriv,        // PSERVER_OP_DIR_DERIV = 11;
-    &ParameterServer2::op_fix_omega_signs,  // PSERVER_OP_FIX_OMEGA_SIGNS = 12;
-    &ParameterServer2::op_cost,             // PSERVER_OP_COST = 13
-    &ParameterServer2::op_start_pass,       // PSERVER_OP_START_PASS = 14
-    &ParameterServer2::op_finish_pass,      // PSERVER_OP_FINISH_PASS = 15
-    &ParameterServer2::op_randomize,        // PSERVER_OP_RANDOMIZE = 16
-    &ParameterServer2::op_apply,            // PSERVER_OP_APPLY = 17
-};
-
-void ParameterServer2::doOperation(const DoOperationRequest& request,
-                                   ProtoResponseCallback callback) {
-  if (request.wait_for_gradient()) {
-    /// wait gradient update
-    gradientReadyBarrier_.wait();
-    allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers;
-  }
-
-  DoOperationResponse response;
-  response.set_pass_finish(allClientPassFinish_);
-
-  for (const auto& op : request.operations()) {
-    OperationResult* opResult = response.add_results();
-    if (op.operation() >= ARRAYSIZE(opFuncs)) {
-      LOG(ERROR) << "Unknown operation " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    OperatorFunction opFunc = opFuncs[op.operation()];
-    if (!opFunc) {
-      LOG(ERROR) << "Operation not implemented: " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    (this->*opFunc)(op, opResult);
-  }
-
-  if (request.send_back_parameter()) {
-    /// clean current cost
-    cost_ = 0;
-
-    if (allClientPassFinish_ && request.release_pass()) {
-      /// This signals that all clients finish one pass, so waitPassFinish()
-      /// will stop waiting.
-      numPassFinishClients_ = 0;
-    }
-
-    /// notify addGradient() to send back parameter
-    parameterReadyBarrier_.wait();
-  }
-  callback(response);
-}
-
-void ParameterServer2::waitPassStart(const WaitPassStartRequest& request,
-                                     ProtoResponseCallback callback) {
-  passBarrier_.wait();
-  callback(WaitPassStartResponse());
-}
-
-void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request,
-                                      ProtoResponseCallback callback) {
-  numPassFinishClients_ += 1;
-
-  while (numPassFinishClients_ != 0) {
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-  }
-
-  callback(WaitPassFinishResponse());
-}
-
-void ParameterServer2::synchronize(const SynchronizeRequest& request,
-                                   ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  dataSize_ = 0;
-  callback(SynchronizeResponse());
-}
-
-void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
-                                       ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  callback(SynchronizeResponse());
-
-  if (request.trainer_id() == 0) {
-    batchId_ = 0;
-  }
-}
-
-void ParameterServer2::createVector(const CreateVectorRequest& request,
-                                    ProtoResponseCallback callback) {
-  (void)request;
-  CreateVectorResponse response;
-  LOG(INFO) << "ParameterServer2::createVector: size=" << size_;
-  CpuVectorPtr vec = std::make_shared<CpuVector>(size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = vectors_.size();
-    vectors_.push_back(vec);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseVector(const ReleaseVectorRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseVectorResponse response;
-  CpuVectorPtr vec;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    vec.swap(vectors_[request.handle()]);
-  }
-  callback(response);
-}
-
-void ParameterServer2::createMatrix(const CreateMatrixRequest& request,
-                                    ProtoResponseCallback callback) {
-  CreateMatrixResponse response;
-  /// We need to create column major matrix of size_ * num_cols
-  /// Matrix is row majoar. Need to tranpose when use it.
-  CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = matrices_.size();
-    matrices_.push_back(mat);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseMatrixResponse response;
-  CpuMatrixPtr mat;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    mat.swap(matrices_[request.handle()]);
-  }
-  callback(response);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.h b/paddle/legacy/pserver/ParameterServer2.h
deleted file mode 100644
index 069e730ea..000000000
--- a/paddle/legacy/pserver/ParameterServer2.h
+++ /dev/null
@@ -1,696 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <limits>
-#include <mutex>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <vector>
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include "paddle/legacy/math/Matrix.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterOptimizer.h"
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/ThreadLocal.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-
-DECLARE_int32(port);
-
-namespace paddle {
-
-// @TODO(yanfei):
-// if armed with high density computation resource per node, pserver could also
-// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline
-// network receiving and GPU computation to reduce the network overhead even
-// further. the pipeline could help to accelerate BIG model training.
-// @TODO:(yanfei)
-// for cpu and less/low gpu machine, the time exhausted by forward and backward
-// could be larger than optimization at pserver. However, if armed with lots of
-// gpus per node and if the model size is so large enough that limited cpu
-// computation causes big optmization latency, the GPU may be required by
-// pserver.
-
-/**
- * Client interface for the parameter server
- *
- * it implements several rpc API for remote parameter client usage.
- * for sync-sgd, client needs one controller thread to build connections
- * to all pservers, these controller connections do barriers
- * synchronization with these connections used for transfering data.
- * each data connection uses block based fine grained synchronization
- * to gain better scalability. Merging gradients from different trainers
- * are concurrently executed with block units, so that some network
- * overhead will be hidden in merging gradient.
- * for async-sgd, the difference is that pserver will do optimization
- * immediately if the gradients are ready, so that pserver needs to
- * prepare separate buffer to store value for sending back to trainer
- * to prevent from being polluted.
- */
-class ParameterServer2 : public ProtoServer {
- protected:
-  /// parameter_ mutex.
-  RWLock parameterMutex_;
-
-  typedef std::pair<size_t, int64_t> BlockKey;
-  struct BlockKeyHash {
-    size_t operator()(const BlockKey& key) const {
-      return std::hash<size_t>()(key.first) + key.second;
-    }
-  };
-
-  // TODO(yanfei):
-  // if index data structure is based on parameters instead of blocks, the
-  // lookup performance could be better. In addition, the block memory
-  // access almost exhibits good locality, so index data structure and
-  // block data structure can be refined further, especially if gpu is used
-  // for pserver.
-  /**
-   * all parameters are stored in CpuVector with a blockMap_ data structure
-   * to index block data required by requests.
-   */
-  typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap;
-  /// <(para, block), global offset(byte) in all parameters>
-  BlockMap blockOffsetMap_;
-  /// <(para, block), global idx [0, nBlocksInAllParameters]>
-  BlockMap blockIdMap_;
-
-  std::vector<CpuVectorPtr> vectors_;
-  std::vector<CpuMatrixPtr> matrices_;
-  std::vector<CpuMemHandlePtr> dataMems_;
-
-  // TODO(yanfei):
-  // if storing sparse_remote_update() flag in request instead of
-  // reading configMap_, and storing config within new block wise
-  // overview data structure, the config mapping, block mapping
-  // can be unified in single clean data structure. Use para_id
-  // to index parameters, use offset to index block within parameter
-  // and keep two index into single one.
-  /**
-   * mapping between parameter and config
-   * different parameter allows different config, such as decay_rate.
-   * for each request, it need to read config for adding gradient
-   * and optmization.
-   */
-  std::unordered_map<size_t, ParameterConfig> configMap_;
-
-  /**
-   * to parallelize the multi-thread and multi-connnection
-   * computation at pserver, it use block unit to reduce
-   * the contention for computation, even further use block
-   * level optimizater control for each block for some special
-   * reason annotated below.
-   */
-  struct BlockInfo {
-    const ParameterConfig* config;
-    std::unique_ptr<std::mutex> lock;
-    /// global offset for all parameters
-    uint64_t offset;
-    /**
-     *
-     * Async sgd in pserver is very different from sync sgd.
-     * Each trainer follows startBatch, update*, finishBatch as in
-     * sync sgd, but all these actions are almost executed by
-     * multi-core and multi-thread simutaneously, so that async
-     * sgd optimization is based on block level in reality, then
-     * per block optimization is necessary indeed. In addition,
-     * per block optimization is also perfered for performance
-     * with multithreads.
-     */
-    std::unique_ptr<ParameterOptimizer> optimizer;
-  };
-  std::vector<BlockInfo> blockInfos_;
-
-  typedef std::vector<std::pair<int64_t, int64_t>> BlockSegments;
-  /// Because some blocks might not be fully used. We keep a
-  /// record of which segments are used.
-  BlockSegments usedSegments_;
-
-  /// record pserver status, all status defined in ParameterService.pb
-  PServerStatus status_;
-  /// record all samples processed which could be used by optimizater
-  std::atomic<int64_t> numSamplesProcessed_;
-  double cost_;
-  int mpiSize_;
-  int dataSize_;
-  /// configuration for current parameter optimizer
-  OptimizationConfig config_;
-
-  /**
-   * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse
-   * compute. And add some helper method to allocate memory aligned blocks.
-   *
-   * @param T          type of element.
-   * @param AlignBytes the memory aligned bytes for allocated blocks.
-   */
-  template <typename T, size_t AlignBytes>
-  class ReadWriteBuffer
-      : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
-   public:
-    static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
-                  "Type T must be able to aligned.");
-
-    /**
-     * @brief IsTLargerThanAlign compiled time calculated constant for is type
-     * T larger than alignments.
-     */
-    constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes;
-
-    static_assert(std::is_pod<T>::value, "T must be POD type.");
-
-    /**
-     * @brief if AlignBytes > sizeof(T), then will calcuate how many elements
-     * can be stored in AlignBytes.
-     */
-    constexpr static size_t AlignElementCount = AlignBytes / sizeof(T);
-
-    static_assert(AlignElementCount ==
-                          (AlignElementCount & -AlignElementCount) ||
-                      AlignBytes > sizeof(T),
-                  "AlignElementCount should be exp of 2");
-
-    /**
-     * @brief Resize Buffer, with block count that will be allocated. Each block
-     * will be memory aligned in AlignBytes.
-     * @param size The element count in all blocks.
-     * @param alignBlockCount The block count that will be allocated.
-     */
-    void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) {
-      if (IsTLargerThanAlign) {  //! So, each elements is memory aligned.
-        this->resize(size);
-      } else {
-        //! at most, we need such elements in buffer to make sure each block is
-        //! aligned.
-        this->resize(size + alignBlockCount * (AlignElementCount - 1));
-      }
-    }
-
-    /**
-     * @brief reset aligned allocate blocks.
-     */
-    void resetAlignAlloc() { this->curOffset_ = 0; }
-
-    /**
-     * @brief get next aligned block address.
-     * @param blockSize is the element count in each block.
-     * @return Aligned block address.
-     */
-    T* nextBlock(size_t blockSize) {
-      T* r = &this->operator[](curOffset_);
-      curOffset_ += blockSize;
-
-      if (!IsTLargerThanAlign) {
-        curOffset_ =
-            (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1);
-      }
-      return r;
-    }
-
-   private:
-    size_t curOffset_;
-  };
-
-  /// to buffer the data from network for further processing to
-  /// reduce redundant memory allocation.
-  ThreadLocal<ReadWriteBuffer<real, ALIGN_HINT>> readWriteBuffer_;
-
-  /// size of the parameter
-  int64_t size_;
-
-  /// for synchronized training, check details in addGradient()
-  /// and doOperation()
-  ThreadBarrier gradientReadyBarrier_;
-  ThreadBarrier parameterReadyBarrier_;
-  ThreadBarrier passBarrier_;
-  ThreadLocal<std::vector<SendParameterRequest>> requestVec_;
-  ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_;
-
-  std::atomic<int> numPassFinishClients_;
-  bool allClientPassFinish_;
-
-  std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_;
-  std::atomic<int> serverId_;
-
-  /**
-   *
-   * for lagged async gradient gradient commit control in Async Sgd.
-   * discard lagged gradients from too slow nodes, whose gradients
-   * exhibits bad quality.
-   * Algorithm:
-   * pserver:
-   * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0.
-   * syncUpdaterSteps means
-   *    the version of parameter value.
-   * 2. when pull arrives, record asyncUpdateSteps_ into
-   * syncTrainerSteps_[trainer_id]
-   * 3. when push arrives, compare asyncUpdateSteps_ with
-   * syncTrainerSteps_[trainer_id]
-   *    if delta > threshold, discard current gradient, else commit
-   *    gradient.
-   * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass
-   * finished
-   * Note:
-   * it can not discard all lag-gradient strictly in some special
-   * condition. part of gradients could be discarded if
-   * ConcurrentRemoteParameterUpdater is sed.
-   * this algorithm is implemented in asynSGD()
-   */
-  int64_t asyncLaggedThreshold_;
-  std::atomic<int64_t> asyncUpdateSteps_;
-  std::vector<int64_t> asyncTrainerSteps_;
-  size_t asyncLaggedGradientsNum_;
-  /// stat all async update
-  std::vector<size_t> asyncUpdateStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerDiscardStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerCommitStat_;
-
-  /// only used by controller and other control cmd from trainer number 0
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  /// pserver for sparse remote update parameters
-  bool isSparseServer_;
-
-  /// barrier performance tuning sync-sgd required
-  std::atomic<int64_t> batchId_;
-
- public:
-  struct Buffer {
-    real* base;
-    size_t size;
-  };
-
- protected:
-  /// async gradient commit control
-  bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
-
- public:
-  /// disable default parameter for overloading
-  /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
-  /// -1 means using TCP transport instead of RDMA
-  ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1);
-
-  ~ParameterServer2() {}
-
-  static const std::string kRetMsgInvalidMatrixHandle;
-  static const std::string kRetMsgInvalidVectorHandle;
-  static const std::string kRetMsgUnknownOperation;
-
-  /// service functions
-  template <typename Dtype>
-  void reduceAndSendData(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  void templateReduceSum(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  /**
-   * @brief framework for sending parameters
-   *
-   * @note  different parameter data type can be sent to pserver.
-   *        in most case, the api is used to send gradients from
-   *        trainer to pserver.
-   *        it also can be used to retrieve parameters from pserver
-   */
-  void sendParameter(const SendParameterRequest& request,
-                     std::unique_ptr<MsgReader> msgReader,
-                     ProtoResponseCallbackEx callback);
-
-  void sendData(const SendDataRequest& request,
-                std::unique_ptr<MsgReader> msgReader,
-                ProtoResponseCallbackEx callback);
-
-  /**
-   * @brief send config to pserver
-   *
-   * @note  it can help pserver to understand the configuration for
-   * optimization,
-   *        logging control, duplicated initialization, etc.
-   */
-  void setConfig(const SetConfigRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief get status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver
-   */
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief set status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver, since parameters
-   *        at pserver are initialized by trainer
-   */
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief framework for doing some operation at pserver end
-   *
-   * @note  if sync-sgd is used, controller will calling op_SGD action
-   *        for gradient optimization.
-   *        check avaiable operations in opFuncs[]
-   */
-  void doOperation(const DoOperationRequest& request,
-                   ProtoResponseCallback callback);
-
-  /// Create a column vector. The size is the dimension of parameter
-  void createVector(const CreateVectorRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseVector(const ReleaseVectorRequest& request,
-                     ProtoResponseCallback callback);
-
-  /// Create a column major matrix. The number of rows is the dimension of
-  /// parameter. The number of columns is specifed by num_cols.
-  void createMatrix(const CreateMatrixRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseMatrix(const ReleaseMatrixRequest& request,
-                     ProtoResponseCallback callback);
-  /**
-   * @brief stateful control for indicationg sync pass start
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassStart(const WaitPassStartRequest& request,
-                     ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicationg sync pass end
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassFinish(const WaitPassFinishRequest& request,
-                      ProtoResponseCallback callback);
-
-  /**
-   * @brief synchronize all distributed trainers
-   *
-   * @note  it's general api for synchronizing trainer and pserver
-   */
-  void synchronize(const SynchronizeRequest& request,
-                   ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicating async pass is finished
-   *
-   * @note  it is valuable for logging control, state reset, etc.
-   */
-  void asyncFinishPass(const SynchronizeRequest& request,
-                       ProtoResponseCallback callback);
-
-  void loadValueVector(const LoadValueRequest& request,
-                       ProtoResponseCallback callback);
-
-  void saveValueVector(const SaveValueRequest& request,
-                       ProtoResponseCallback callback);
-
- public:
-  /**
-   * @brief initialize parameter server
-   */
-  bool init();
-
-  /**
-   * @brief set parameters at pserver
-   *
-   * @note  do parameter initialization if neccessy.
-   */
-  void setParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief receive gradients and do optimization for async-sgd
-   *
-   * @note  this api asynchronizately receives all data from all
-   *        trainers, and immediately do optimization and return
-   *        optimizated value for trainer.
-   *        this above routine are block based atomic updating,
-   *        which means different block could based different stale
-   *        gradient.
-   *        it will discard some lagged gradients by default for
-   *        better convergence.
-   */
-  void asyncSGD(const SendParameterRequest& request,
-                std::vector<Buffer>& inputBuffers,
-                SendParameterResponse* response,
-                std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief merge gradients from all trainer
-   *
-   * @note  this api use block based parallelization as fine grained
-   *        parallelization which benifits lock contention and latency
-   *        hidden for communication, also can harness multi-core
-   *        efficiently.
-   *        it also implements the synchronization for sync-sgd
-   */
-  void addGradient(const SendParameterRequest& request,
-                   std::vector<Buffer>& inputBuffers,
-                   SendParameterResponse* response,
-                   std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get dense parameters from pserver
-   *
-   * @note  for some specified condition, trainer will get parameters from
-   *        pservers.
-   *        e.g.
-   *        if all parameters are stored at perver end for big model training
-   *        trainer can use it to retrieve all parameters if necessary.
-   */
-  void getParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get sparse value from parameter server
-   *
-   * @note  with sparse enabled, pservers own all latest value
-   *        while trainer only retrieve value that only are needed.
-   *        e.g.
-   *        trainer will do prefetch action to retrieve necessary latest
-   *        value from pserver for sparse calculation.
-   */
-  void getParameterSparse(const SendParameterRequest& request,
-                          std::vector<Buffer>& inputBuffers,
-                          SendParameterResponse* response,
-                          std::vector<Buffer>* outputBuffers);
-
- protected:
-  void mergeSegments(BlockSegments* segments);
-
-  /// set the unused segments to zero
-  void clearUnusedSegments(CpuVector* vec);
-
-  // TODO(yanfei):
-  // if read data and do optimization interleavely block by block,
-  // the performance could be better for gaining less network congestion.
-  /// read all data from connection and store it in static pre-allocated buffer
-  void readAllBlocks(MsgReader* msgReader,
-                     std::vector<ParameterServer2::Buffer>* buffers);
-
-  const ParameterConfig& getParameterConfig(const ParameterBlock& block) {
-    CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:"
-                                    << block.para_id();
-    const auto it = configMap_.find(block.para_id());
-    CHECK(it != configMap_.end()) << "can not find parameter id: "
-                                  << block.para_id();
-    return it->second;
-  }
-
-  /// it implictly check blockOffsetMap_ while retrieving blockId
-  const ParameterConfig& getParameterConfig(int64_t blockId) const {
-    CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size())
-        << "block idx out of range, id: " << blockId
-        << " info size: " << blockInfos_.size();
-    return *(blockInfos_[blockId].config);
-  }
-
-  template <class Response>
-  bool isValidVectorHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= vectors_.size()) {
-      LOG(ERROR) << "Invalid vector handle " << handle;
-      response->set_return_message(kRetMsgInvalidVectorHandle);
-      return false;
-    }
-    return true;
-  }
-
-  template <class Response>
-  bool isValidMatrixHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= matrices_.size()) {
-      LOG(ERROR) << "Invalid matrix handle " << handle;
-      response->set_return_message(kRetMsgInvalidMatrixHandle);
-      return false;
-    }
-    return true;
-  }
-
-  /**
-   * @brief get block offset
-   *
-   * @note  block.begin_dim is added to the block offset.
-   *        return -1 if block cannot be found
-   */
-  int64_t getBlockOffset(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockOffsetMap_.find(key);
-    if (it == blockOffsetMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /// return -1 if block cannot be found
-  int64_t getBlockId(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockIdMap_.find(key);
-    if (it == blockIdMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify reponse and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses
-   *        vectors_[parameterType] directly
-   *        for dense with sync-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify response and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses buffer->base
-   *        The parameter values are copied from vectors_[parameterType]
-   *        to buffer->base.
-   *        for dense with async-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         Buffer* buffer,
-                         std::vector<Buffer>* outputBuffers);
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  specified for sparse
-   */
-  void sendBackParameterSparse(const ParameterBlock& block,
-                               int parameterType,
-                               SendParameterResponse* response,
-                               Buffer* buffer,
-                               size_t width,
-                               std::vector<Buffer>* outputBuffers);
-
-  /**
-   * framework routine for block parallelization
-   * e.g.
-   * for optimization on all blocks at pserver end, this routine can facilitize
-   * the parallelize of do optimization on all blocks with multithreads.
-   */
-  typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc;
-  void parallelExecForEachBlock(ExecFunc func);
-  void blockTraverse(BlockInfo& info,
-                     const ParameterConfig& config,
-                     int64_t offset,
-                     size_t size,
-                     const VectorPtr vecs[],
-                     const ParameterOptimizer::TraverseCallback& callback);
-
- public:
-  typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
-                                                     OperationResult* result);
-
-  /**
-   * doOperation will call following operations indirectly
-   * e.g.
-   * for sync-sgd control, the controller in remote updater will send op_SGD
-   * command to pserver, then send sendParameter request to pserver immediately.
-   * the two function at pserver end will do cooperation to achieve the sync-sgd
-   * gradient merge and optimization.
-   * the most following operations are specified for owlqn, all operations are
-   * under the context of doOperation function
-   */
-  static OperatorFunction opFuncs[];
-
-  void op_SGD(const Operation& operation, OperationResult* result);
-
-  void op_RESET(const Operation& operation, OperationResult* result);
-
-  void op_utv(const Operation& operation, OperationResult* result);
-
-  void op_au_bv(const Operation& operation, OperationResult* result);
-
-  void op_COPY(const Operation& operation, OperationResult* result);
-
-  void op_au(const Operation& operation, OperationResult* result);
-
-  void op_au_bv_cw(const Operation& operation, OperationResult* result);
-
-  void op_make_steepest_desc_dir(const Operation& operation,
-                                 OperationResult* result);
-
-  void op_fix_dir_signs(const Operation& operation, OperationResult* result);
-
-  void op_dir_deriv(const Operation& operation, OperationResult* result);
-
-  void op_fix_omega_signs(const Operation& operation, OperationResult* result);
-
-  void op_cost(const Operation& operation, OperationResult* result);
-
-  void op_start_pass(const Operation& operation, OperationResult* result);
-  void op_finish_pass(const Operation& operation, OperationResult* result);
-
-  void op_apply(const Operation& operation, OperationResult* result);
-
-  void op_randomize(const Operation& operation, OperationResult* result);
-
-  void op_load(const Operation& operation, OperationResult* result);
-  void op_save(const Operation& operation, OperationResult* result);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2Main.cpp b/paddle/legacy/pserver/ParameterServer2Main.cpp
deleted file mode 100644
index dfbae0cd0..000000000
--- a/paddle/legacy/pserver/ParameterServer2Main.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "ParameterServerController.h"
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-
-  std::unique_ptr<ParameterServerController> parameterServerPtr(
-      paddle::ParameterServerController::createFromGflags());
-  parameterServerPtr->start();
-  parameterServerPtr->wait();
-
-  return 0;
-}
diff --git a/paddle/legacy/pserver/ParameterServerController.cpp b/paddle/legacy/pserver/ParameterServerController.cpp
deleted file mode 100644
index 2a7dcc15a..000000000
--- a/paddle/legacy/pserver/ParameterServerController.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServerController.h"
-
-namespace paddle {
-
-ParameterServerController::ParameterServerController(
-    const ParameterServerConfig& config) {
-  // round robin to load balance RDMA server ENGINE
-  std::vector<std::string> devices;
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = config.ports_num() + config.ports_num_for_sparse();
-
-  if (config.nics().empty()) {
-    parameterServers_.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (config.rdma_tcp() == "rdma") {
-        parameterServers_[i].reset(
-            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        parameterServers_[i].reset(
-            new ParameterServer2(std::string(), config.port() + i));
-      }
-      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
-                                             "server on port "
-                                          << config.port() + i;
-    }
-  } else {
-    str::split(config.nics(), ',', &devices);
-    parameterServers_.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (config.rdma_tcp() == "rdma") {
-          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          parameterServers_[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
-        }
-        CHECK(parameterServers_[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server with device " << devices[j]
-            << config.port() + i;
-      }
-    }
-  }
-}
-
-ParameterServerController::~ParameterServerController() { this->wait(); }
-
-ParameterServerController* ParameterServerController::createFromGflags() {
-  ParameterServerConfig config;
-
-  config.set_nics(FLAGS_nics);
-  config.set_rdma_tcp(FLAGS_rdma_tcp);
-  config.set_port(FLAGS_port);
-  config.set_ports_num(FLAGS_ports_num);
-  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
-
-  return create(config);
-}
-
-ParameterServerController* ParameterServerController::create(
-    const ParameterServerConfig& config) {
-  return new ParameterServerController(config);
-}
-
-void ParameterServerController::start() {
-  LOG(INFO) << "number of parameterServer instances: "
-            << parameterServers_.size();
-  int i = 0;
-  for (const auto& parameterServer : parameterServers_) {
-    LOG(INFO) << "Starting parameterServer[" << i << "]";
-    parameterServer->start();
-    i++;
-  }
-}
-
-void ParameterServerController::wait() {
-  int i = 0;
-  for (const auto& parameterServer : parameterServers_) {
-    LOG(INFO) << "Waiting parameterServer[" << i << "]";
-    parameterServer->join();
-    i++;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServerController.h b/paddle/legacy/pserver/ParameterServerController.h
deleted file mode 100644
index b90d0cbce..000000000
--- a/paddle/legacy/pserver/ParameterServerController.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterServer2.h"
-#include "ParameterServerConfig.pb.h"
-#include "RDMANetwork.h"
-#include "paddle/legacy/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * @brief ParameterServerController is used for create, init and manage multi
- * parameter server instances. The num of the instances is decided by port
- * num(the ports number for parameter send) and network devices configured
- * by gflags or proto.
- */
-class ParameterServerController final {
- public:
-  DISABLE_COPY(ParameterServerController);
-
-  /**
-   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
-   */
-  explicit ParameterServerController(const ParameterServerConfig& config);
-
-  /**
-   * @brief Dtor.
-   */
-  ~ParameterServerController();
-
-  /**
-   * @brief create ParameterServerController from gflags, this is used for
-   * compatibility with the old usage of configuration by gflags.
-   */
-  static ParameterServerController* createFromGflags();
-
-  /**
-   * @brief create ParameterServerController with ParameterServerConfig, remove
-   * gflags from ParameterServer. Init all ParameterServer2 instances according
-   * to
-   * the config.
-   */
-  static ParameterServerController* create(const ParameterServerConfig& config);
-
-  /**
-   * @brief start all ParameterServer2 instances in this
-   * ParameterServerController.
-   */
-  void start();
-
-  /**
-   * @brief join and wait for all ParameterServer2 instances thread in this
-   * ParameterServerController.
-   */
-  void wait();
-
- private:
-  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ProtoServer.cpp b/paddle/legacy/pserver/ProtoServer.cpp
deleted file mode 100644
index 6b7948a7d..000000000
--- a/paddle/legacy/pserver/ProtoServer.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoServer.h"
-
-namespace paddle {
-
-void ProtoServer::handleRequest(std::unique_ptr<MsgReader> msgReader,
-                                ResponseCallback callback) {
-  /// 0 for funcName
-  /// 1 for proto
-  CHECK_GE(msgReader->getNumBlocks(), (size_t)2);
-
-  std::string funcName(msgReader->getNextBlockLength(), 0);
-  /// read function name string
-  msgReader->readNextBlock(&funcName[0]);
-  /// looking up rpc wrapped callback function
-  auto it = nameToFuncMap_.find(funcName);
-  if (it != nameToFuncMap_.end()) {
-#ifndef PADDLE_DISABLE_TIMER
-    gettimeofday(&(*(handleRequestBegin_)), nullptr);
-#endif
-    it->second(std::move(msgReader), callback);
-  } else {
-    LOG(ERROR) << "Unknown funcName: " << funcName;
-    std::vector<iovec> iovs;
-    callback(iovs);
-  }
-}
-
-void ProtoServer::registerServiceFunctionImp(const std::string& funcName,
-                                             ServiceFunction func) {
-  CHECK(!nameToFuncMap_.count(funcName)) << "Duplicated registration: "
-                                         << funcName;
-  nameToFuncMap_[funcName] = func;
-}
-
-void ProtoClient::send(const char* funcName,
-                       const google::protobuf::MessageLite& proto,
-                       const std::vector<iovec>& userIovs) {
-  std::string protoStr;
-  CHECK(proto.SerializeToString(&protoStr));
-  std::vector<iovec> iovs;
-  iovs.reserve(iovs.size() + 2);
-  /// sending function name string, protobuf data and user additional data
-  iovs.push_back({(void*)funcName, strlen(funcName)});
-  iovs.push_back({&protoStr[0], protoStr.size()});
-  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
-  channel_->writeMessage(iovs);
-}
-
-std::unique_ptr<MsgReader> ProtoClient::recv(
-    google::protobuf::MessageLite* proto) {
-  std::vector<iovec> iovs;
-  std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
-  CHECK_GE(msgReader->getNumBlocks(), (size_t)1);
-  std::string str(msgReader->getNextBlockLength(), 0);
-  msgReader->readNextBlock(&str[0]);
-  CHECK(proto->ParseFromString(str));
-  return msgReader;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/ProtoServer.h b/paddle/legacy/pserver/ProtoServer.h
deleted file mode 100644
index 2943867de..000000000
--- a/paddle/legacy/pserver/ProtoServer.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LightNetwork.h"
-
-#include <map>
-
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- *
- * It implements the rpc framework, which launchs one thread for each
- * connection. Here define one parameter server as single TCP server
- * binding on single port. All connections share single tcp ProtoServer
- * object, each connection handles all requests from specified trainer
- * within single worker thread.
- * to accelerate bandwidth efficiency and harness multicore for pserver
- * optimization to reduce pserver latency, you could launch more port
- * for single NIC hardward with --port=N(N>1) for small cluster job.
- */
-class ProtoServer : public SocketServer {
- public:
-  /// rdmaCpu controls the cpu affinity of RDMA server daemon,
-  /// which could benifit performance. rdmaCpu = -1 means TCP
-  /// is used instead of RDMA transport.
-  ProtoServer(const std::string& addr, int port, int rdmaCpu = -1)
-      : SocketServer(addr, port, rdmaCpu) {}
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut,
-                             const std::vector<iovec>& outputIovs)>
-      ProtoResponseCallbackEx;
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut)>
-      ProtoResponseCallback;
-
-  /**
-   * Register a service function for this server
-   * void(const ProtoIn& request,
-   *      ProtoResponseCallback callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-
-   * Use macro REGISTER_SERVICE_FUNCTION as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunction(
-      const std::string& funcName,
-      std::function<void(const ProtoIn& request,
-                         ProtoResponseCallback callback)> func);
-
-  /**
-   * Register a service function for this server
-   * The signature of the service function is
-   * void(const ProtoIn&,
-   *      std::unique_ptr<MsgReader> msgReader,
-   *      ProtoResponseCallbackEx callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-   * The extended service function can take extra input blocks from
-   * the communication channel by reading msgReader. It can also
-   * send extra blocks to the communication channel by providing
-   * outputIovs as the argument for the callback function.
-
-   * Use macro REGISTER_SERVICE_FUNCTION_EX as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunctionEx(
-      const std::string& funcName,
-      std::function<void(const ProtoIn&,
-                         std::unique_ptr<MsgReader> msgReader,
-                         ProtoResponseCallbackEx callback)> func);
-
- protected:
-  /**
-   * @brief handle rpc request
-   * @param[in] msgReader  Message reader for reading data from connection
-   * @param[in] callback   equal to channel->writeMessage
-   *
-   * @note  it lookups rpc function mapping table to find function pointer,
-   *        then call this function with further reading data from connection
-   */
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback);
-
-  typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback)>
-      ServiceFunction;
-
-  /**
-   * @brief register one RPC function in function mapping
-   * @param[in] funcName  function name string
-   * @param[in] func      rpc function wrapped with reading and writing data
-   */
-  void registerServiceFunctionImp(const std::string& funcName,
-                                  ServiceFunction func);
-
- protected:
-  /// Tuning bare network overhead: the beginning of receiving request
-  ThreadLocal<struct timeval> handleRequestBegin_;
-
-  /// mapping to find rpc function while handling request
-  std::map<std::string, ServiceFunction> nameToFuncMap_;
-};
-
-class ProtoClient : public SocketClient {
- public:
-  ProtoClient(const std::string& serverAddr,
-              int serverPort,
-              enum ChannelType channelType = F_TCP)
-      : SocketClient(serverAddr, serverPort, channelType) {}
-
-  /**
-   * @brief Make a request to the server.
-   * @param[in] funcName  request rpc function name string
-   * @param[in] proto     protobuf data for sending to pserver
-   * @param[in] iov       additional iov data for sending to pserver
-   *
-   * @note  iov provides additional blocks which need to be written to the
-   *        communication channel
-   */
-  void send(const char* funcName,
-            const google::protobuf::MessageLite& proto,
-            const std::vector<iovec>& iov = std::vector<iovec>());
-
-  /**
-   * @brief receive the response from the server.
-   * @param[in] proto     proto binary buffer
-   *
-   * @note  this must be paired with a corresponding send() call. The
-   *        returned MsgReader allows the caller to receive additional
-   *        blocks from the communication channel.
-   */
-  std::unique_ptr<MsgReader> recv(google::protobuf::MessageLite* proto);
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn);
-    return recv(protoOut);
-  }
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      const std::vector<iovec>& iov,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn, iov);
-    return recv(protoOut);
-  }
-};
-
-template <class>
-struct service_arg_type;
-/// helper class for obtaining the argument type of a service function
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(const Arg1&, Arg2)> {
-  typedef Arg1 _1;
-};
-
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(  // NOLINT
-    const Arg1&,
-    std::unique_ptr<MsgReader>,
-    Arg2)> {
-  typedef Arg1 _1;
-};
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION(className, funcName)       \
-  registerServiceFunction<                                   \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2))
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION_EX(className, funcName)    \
-  registerServiceFunctionEx<                                 \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2,                       \
-                std::placeholders::_3))
-
-/// create wrapper function for parameter server high level function and
-/// register the wrapper function into function mapping.
-template <class ProtoIn>
-void ProtoServer::registerServiceFunctionEx(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&,
-                       std::unique_ptr<MsgReader> msgReader,
-                       ProtoResponseCallbackEx callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    auto pcob = [callback](const google::protobuf::MessageLite& response,
-                           const std::vector<iovec>& outputIovs) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end());
-      callback(iovs);
-    };
-
-    func(request, std::move(msgReader), pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-template <class ProtoIn>
-void ProtoServer::registerServiceFunction(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&, ProtoResponseCallback callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    msgReader.reset();
-
-    auto pcob = [callback](const google::protobuf::MessageLite& response) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      callback(iovs);
-    };
-
-    func(request, pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/RDMANetwork.h b/paddle/legacy/pserver/RDMANetwork.h
deleted file mode 100644
index c87056f72..000000000
--- a/paddle/legacy/pserver/RDMANetwork.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_DISABLE_RDMA
-#include "sxi_sock.h"
-#else
-#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma"
-#endif
-#include "paddle/legacy/utils/Logging.h"
-
-#include <netinet/in.h>
-struct sxi_sock;
-struct sxi_socket;
-
-#ifndef MAX_VEC_SIZE
-// define default MAX_VEC_SIZE
-#define MAX_VEC_SIZE (1UL << 16)
-#endif
-
-namespace paddle {
-/// Namespace rdma is adaptors for sxi_sock.h. Make paddle not depend on it
-/// when disable rdma support
-namespace rdma {
-inline int numCpus() {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_num_configured_cpus();
-#else
-  return 0;
-#endif
-}
-
-inline sxi_socket* ssocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_ssocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int listen(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_listen(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int bind(sxi_socket* s, const char* str) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_bind(s, str);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* accept(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_accept(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sockaddr_in* getSourceAddress(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return reinterpret_cast<sockaddr_in*>(&sock->sa);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_socket* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_socket_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_sock_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline void init() {
-#ifndef PADDLE_DISABLE_RDMA
-  sxi_module_init();
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_socket* csocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_csocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t read(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_read(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t write(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_write(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_readv(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_writev(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* connect(sxi_socket* socket, const char* url) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_connect(socket, url);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-}  //  namespace rdma
-}  //  namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.cpp b/paddle/legacy/pserver/SocketChannel.cpp
deleted file mode 100644
index 79c763c62..000000000
--- a/paddle/legacy/pserver/SocketChannel.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SocketChannel.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include "RDMANetwork.h"
-
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
- * declares it on osx/ios if defined(KERNEL)
- */
-#ifndef UIO_MAXIOV
-#define UIO_MAXIOV 512
-#endif
-
-SocketChannel::~SocketChannel() {
-  if (tcpRdma_ == F_TCP)
-    close(tcpSocket_);
-  else
-    rdma::close(rdmaSocket_);
-  LOG(INFO) << "destory connection in socket channel, peer = " << peerName_;
-}
-
-size_t SocketChannel::read(void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::read(tcpSocket_, (char*)buf + total, size - total);
-    else
-      len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-size_t SocketChannel::write(const void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::write(tcpSocket_, (const char*)buf + total, size - total);
-    else
-      len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-template <class IOFunc, class SocketType>
-static size_t readwritev(IOFunc iofunc,
-                         SocketType socket,
-                         iovec* iovs,
-                         int iovcnt,
-                         int maxiovs,
-                         const std::string& peerName) {
-  int curIov = 0;
-  size_t total = 0;
-
-  for (int i = 0; i < iovcnt; ++i) {
-    total += iovs[i].iov_len;
-  }
-
-  size_t size = 0;
-  size_t curIovSizeDone = 0;
-
-  while (size < total) {
-    ssize_t len =
-        iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
-    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
-                   << " iovCnt=" << iovcnt
-                   << " iovs[curIov].base=" << iovs[curIov].iov_base
-                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
-    size += len;
-
-    /// restore iovs[curIov] to the original value
-    iovs[curIov].iov_base =
-        (void*)((char*)iovs[curIov].iov_base - curIovSizeDone);
-    iovs[curIov].iov_len += curIovSizeDone;
-
-    len += curIovSizeDone;
-
-    while (curIov < iovcnt) {
-      if ((size_t)len < iovs[curIov].iov_len) break;
-      len -= iovs[curIov].iov_len;
-      ++curIov;
-    }
-    if (curIov < iovcnt) {
-      curIovSizeDone = len;
-      iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len);
-      iovs[curIov].iov_len -= len;
-    }
-  }
-  return size;
-}
-
-/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload
-/// transfering
-size_t SocketChannel::writev(const std::vector<struct iovec>& iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::writev,
-                      tcpSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::writev,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-size_t SocketChannel::readv(std::vector<struct iovec>* iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::readv,
-                      tcpSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::readv,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
-  MessageHeader header;
-  header.numIovs = userIovs.size();
-
-  std::vector<size_t> iovLengths;
-  iovLengths.reserve(userIovs.size());
-  for (auto& iov : userIovs) {
-    iovLengths.push_back(iov.iov_len);
-  }
-
-  std::vector<iovec> iovs;
-  iovs.reserve(userIovs.size() + 2);
-  iovs.push_back({&header, sizeof(header)});
-  iovs.push_back({&iovLengths[0],
-                  static_cast<size_t>(sizeof(iovLengths[0]) * header.numIovs)});
-  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
-
-  header.totalLength = 0;
-  for (auto& iov : iovs) {
-    header.totalLength += iov.iov_len;
-  }
-
-  CHECK(writev(iovs) == (size_t)header.totalLength);
-}
-
-std::unique_ptr<MsgReader> SocketChannel::readMessage() {
-  MessageHeader header;
-
-  size_t len = read(&header, sizeof(header));
-  if (len == 0) {
-    return nullptr;
-  }
-
-  CHECK(len == sizeof(header));
-
-  std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
-
-  CHECK_EQ(msgReader->getTotalLength() + sizeof(header) +
-               msgReader->getNumBlocks() * sizeof(size_t),
-           (size_t)header.totalLength)
-      << " totalLength=" << msgReader->getTotalLength()
-      << " numBlocks=" << msgReader->getNumBlocks();
-  return msgReader;
-}
-
-MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
-    : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
-  size_t size = numBlocks * sizeof(blockLengths_[0]);
-  CHECK(channel_->read(&blockLengths_[0], size) == size);
-}
-
-void MsgReader::readBlocks(const std::vector<void*>& bufs) {
-  CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size());
-  std::vector<iovec> iovs;
-  iovs.reserve(bufs.size());
-  size_t totalLength = 0;
-  for (void* buf : bufs) {
-    iovs.push_back({buf, getNextBlockLength()});
-    totalLength += getNextBlockLength();
-    ++currentBlockIndex_;
-  }
-
-  CHECK(channel_->readv(&iovs) == totalLength);
-}
-
-void MsgReader::readNextBlock(void* buf) {
-  CHECK_LT(currentBlockIndex_, blockLengths_.size());
-  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
-  ++currentBlockIndex_;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.h b/paddle/legacy/pserver/SocketChannel.h
deleted file mode 100644
index a7b3cd42f..000000000
--- a/paddle/legacy/pserver/SocketChannel.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <sys/uio.h>
-
-#include <memory>
-#include <vector>
-
-struct sxi_sock;
-
-namespace paddle {
-
-class SocketChannel;
-enum ChannelType {
-  F_TCP = 1,
-  F_RDMA = 2,
-};
-
-/// reading a set of blocks of data from SocketChannel.
-class MsgReader {
- public:
-  MsgReader(SocketChannel* channel, size_t numIovs);
-  ~MsgReader() {
-    /// ensure all data blocks have been processed
-    CHECK_EQ(currentBlockIndex_, blockLengths_.size());
-  }
-  /**
-   * @brief number of remaining parts
-   */
-  size_t getNumBlocks() const {
-    return blockLengths_.size() - currentBlockIndex_;
-  }
-
-  /**
-   * @brief lenght of next block
-   */
-  size_t getNextBlockLength() const { return getBlockLength(0); }
-
-  /**
-   * @brief get the total length of all the remaining blocks
-   */
-  size_t getTotalLength() const {
-    size_t total = 0;
-    for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) {
-      total += blockLengths_[i];
-    }
-    return total;
-  }
-
-  /**
-   * @brief Get the length for block currentBlockIndex + i
-   */
-  size_t getBlockLength(size_t i) const {
-    return blockLengths_[currentBlockIndex_ + i];
-  }
-
-  /**
-   * @brief  read blocks data and store it to buf
-   */
-  void readBlocks(const std::vector<void*>& bufs);
-  void readNextBlock(void* buf);
-
- protected:
-  SocketChannel* channel_;
-  std::vector<size_t> blockLengths_;
-  size_t currentBlockIndex_;
-};
-
-/// APIs for reading and writing byte stream data or naive iov data
-/// from the APIs both RDMA and TCP exhibits byte stream style
-class SocketChannel {
- public:
-  SocketChannel(int socket, const std::string& peerName)
-      : tcpSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_TCP;
-  }
-  SocketChannel(struct sxi_sock* socket, const std::string& peerName)
-      : rdmaSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_RDMA;
-  }
-
-  ~SocketChannel();
-
-  const std::string& getPeerName() const { return peerName_; }
-
-  /**
-   * @brief read size bytes.
-   *
-   * @note  keep reading until getting size bytes or sock is closed
-   *        is closed
-   */
-  size_t read(void* buf, size_t size);
-
-  /**
-   * @brief write size bytes.
-   *
-   * @note  keep writing until writing size bytes or sock is closed
-   */
-  size_t write(const void* buf, size_t size);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are written or sock is closed
-   */
-  size_t writev(const std::vector<struct iovec>& iov);
-
-  /**
-   * @brief read a set of buffers.
-   *
-   * @note  keep reading until all buffers are full or sock is closed.
-   */
-  size_t readv(std::vector<struct iovec>* iov);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are passed or sock is closed
-   */
-  void writeMessage(const std::vector<struct iovec>& iov);
-
-  /// return null to indicate socket is closed
-  std::unique_ptr<MsgReader> readMessage();
-
- protected:
-  struct MessageHeader {
-    int64_t totalLength;  /// include the header
-    int64_t numIovs;
-    int64_t iovLengths[0];
-  };
-
-  int tcpSocket_;
-  struct sxi_sock* rdmaSocket_;
-  const std::string peerName_;
-  enum ChannelType tcpRdma_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.cpp b/paddle/legacy/pserver/SparseParameterDistribution.cpp
deleted file mode 100644
index 3f17b228f..000000000
--- a/paddle/legacy/pserver/SparseParameterDistribution.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Flags.h"
-
-#include "SparseParameterDistribution.h"
-
-DEFINE_bool(check_sparse_distribution_in_pserver,
-            false,
-            "check whether sparse parameter exhibts balanced distribution at "
-            "all pservers");
-DEFINE_bool(show_check_sparse_distribution_log,
-            false,
-            "show logs details for sparse parameter distribution in pserver");
-DEFINE_int32(check_sparse_distribution_batches,
-             100,
-             "run sparse parameter distribution check for N batches");
-DEFINE_double(
-    check_sparse_distribution_ratio,
-    0.6,
-    "if parameters dispatched to different pservers exhibit unbalanced "
-    " distribution for check_sparse_distribution_ratio * "
-    " check_sparse_distribution_batches times, crash program");
-DEFINE_double(check_sparse_distribution_unbalance_degree,
-              2.0,
-              "the ratio of maximum data size and minimun data size for "
-              "different pserver");
-
-namespace paddle {
-
-SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) {
-  totBytes_ = 0;
-  data_.resize(serviceNum);
-
-  batchPassed_ = 0;
-  unbalanceCnt_ = 0;
-}
-
-void SparseParameterDistribution::probeDistribution(int serverId,
-                                                    size_t dataSize) {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ > FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  CHECK_LT((size_t)serverId, data_.size())
-      << "invalid sparse parameter distribution probe";
-
-  data_[serverId] += dataSize;
-  totBytes_ += dataSize;
-}
-
-void SparseParameterDistribution::checkAndResetDistribution() {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ >= FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  /// at runtime, prepareSendData is called by many contexts,
-  /// so need to check if data is avaiable.
-  if (!totBytes_) {
-    return;
-  }
-
-  /// check if distribution is balanced
-  auto avgSize = totBytes_ / data_.size();
-  auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree;
-  for (auto& dataSize : data_) {
-    if (dataSize > unbalanceDegree * avgSize ||
-        dataSize * unbalanceDegree < avgSize) {
-      unbalanceCnt_++;
-      break;
-    }
-  }
-
-  auto printData = [&]() {
-    std::stringstream ss;
-    for (auto& dataSize : data_) {
-      ss << dataSize * 0.001 << "KB ";
-    }
-    ss << std::endl;
-    LOG(INFO) << ss.str();
-  };
-
-  /// show all sparse data size for different pserver
-  if (FLAGS_show_check_sparse_distribution_log) {
-    LOG(INFO) << "sparse distribution:";
-    printData();
-  }
-
-  totBytes_ = 0;
-  batchPassed_++;
-
-  if (batchPassed_ == FLAGS_check_sparse_distribution_batches) {
-    LOG(INFO) << "show last parameter distribution sample:";
-    printData();
-    LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_
-              << " in passed batches: " << batchPassed_;
-    CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_,
-             FLAGS_check_sparse_distribution_ratio)
-        << "unbalanced sparse parameter distribution for different pserver. "
-        << "it could be caused by unbalanced sparse ids distribution, try "
-        << "to shuffle dimensions in input samples";
-  }
-
-  std::fill(data_.begin(), data_.end(), 0);
-}
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.h b/paddle/legacy/pserver/SparseParameterDistribution.h
deleted file mode 100644
index ee7802995..000000000
--- a/paddle/legacy/pserver/SparseParameterDistribution.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <unistd.h>
-
-#include <atomic>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-/*
- * if sparse_remote_updater is used, different ParameterServer could
- * be assigned with unbalanced gradients. the parameter value from
- * ParameterServer also be not balanced. the distribution of different
- * dimensions of sparse ids determines the unbalanced degree of data
- * distributed among all ParameterServers. Even distribution will
- * benifits cluster efficiency.
- * do check the unbalanced degree of gradients at runtime, crash program
- * if unbalanced distribution exhibts by default.
- */
-class SparseParameterDistribution {
- public:
-  /// serviceNum means the number of ParameterServers
-  explicit SparseParameterDistribution(size_t serviceNum);
-  ~SparseParameterDistribution() {}
-  /// collect data
-  void probeDistribution(int serverId, size_t data);
-  void checkAndResetDistribution();
-
- private:
-  std::vector<size_t> data_;
-  std::atomic<size_t> totBytes_;
-
-  /// after some batches, stop to check
-  int batchPassed_;
-
-  /// stat on unbalanced distribution found
-  int unbalanceCnt_;
-};
-}  // namespace paddle
diff --git a/paddle/legacy/pserver/test/.gitignore b/paddle/legacy/pserver/test/.gitignore
deleted file mode 100644
index aeb58c5b5..000000000
--- a/paddle/legacy/pserver/test/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-log
-test_ParameterServer
-test_ParameterServer2
-socket_test
-test_ProtoServer
diff --git a/paddle/legacy/pserver/test/CMakeLists.txt b/paddle/legacy/pserver/test/CMakeLists.txt
deleted file mode 100644
index b66a00ba0..000000000
--- a/paddle/legacy/pserver/test/CMakeLists.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-######################### socket_test ########################
-add_unittest_without_exec(socket_test
-    SocketTest.cpp)
-
-add_test(NAME socket_test
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
-        ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10)
-
-####################### test_ProtoServer ####################
-add_unittest_without_exec(test_ProtoServer
-    test_ProtoServer.cpp)
-
-IF(NOT ON_TRAVIS)
-    add_test(NAME test_ProtoServer
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
-            ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
-ENDIF(NOT ON_TRAVIS)
-
-# TODO(yuyang18): Run test_ProtoServer when with rdma
-# add_test(NAME test_ProtoServerRDMA
-#   COMMAND ...)
-
-#################### test_ParameterServer2 ####################
-add_unittest_without_exec(test_ParameterServer2
-    test_ParameterServer2.cpp)
-add_test(NAME test_ParameterServer2
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4
-        ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2)
diff --git a/paddle/legacy/pserver/test/SocketTest.cpp b/paddle/legacy/pserver/test/SocketTest.cpp
deleted file mode 100644
index 3a781fcbf..000000000
--- a/paddle/legacy/pserver/test/SocketTest.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include <thread>
-
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/utils/Logging.h"
-
-struct MessageHeader {
-  int64_t dataLength;
-};
-
-class Thread {
- public:
-  void start();
-  virtual void run() = 0;
-  virtual ~Thread() {}
-
- protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-void Thread::start() {
-  thread_.reset(new std::thread([this]() { this->run(); }));
-}
-
-class SocketChannel {
- public:
-  explicit SocketChannel(int socket) : socket_(socket) {}
-  int getSocketFd() const { return socket_; }
-  uint64_t readAll(void* buf, size_t size);
-  uint64_t writeAll(const void* buf, size_t size);
-
- protected:
-  int socket_;
-};
-
-uint64_t SocketChannel::readAll(void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = read(socket_, (char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = write(socket_, (const char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-class SocketWorker : public Thread {
- public:
-  explicit SocketWorker(int socket) : channel_(socket) {}
-  virtual void run();
-
-  // read n bytes.
-  int64_t readAll(char* buf, size_t n);
-
-  // write n bytes
-
- protected:
-  SocketChannel channel_;
-  std::string buffer_;
-};
-
-class SocketServer : public Thread {
- public:
-  explicit SocketServer(int port)
-      : port_(port), socket_(0), maxPendingConnections_(100) {}
-
-  virtual void run();
-
- protected:
-  int port_;
-  int socket_;
-  int maxPendingConnections_;
-};
-
-void SocketServer::run() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-
-  /* First call to socket() function */
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /* Initialize socket structure */
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = INADDR_ANY;
-  serv_addr.sin_port = htons(port_);
-
-  /* Now bind the host address using bind() call.*/
-  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding";
-
-  /* Now start listening for the clients, here process will
-   * go in sleep mode and will wait for the incoming connection
-   */
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /* Accept actual connection from the client */
-    newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-
-    SocketWorker* worker = new SocketWorker(newsockfd);
-    worker->start();
-  }
-}
-
-void SocketWorker::run() {
-  MessageHeader header;
-
-  while (true) {
-    int64_t n = channel_.readAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-
-    buffer_.resize(header.dataLength);
-    n = channel_.readAll(&buffer_[0], header.dataLength);
-    CHECK(n == header.dataLength) << "ERROR reading from socket";
-
-    /* Write a response to the client */
-    n = channel_.writeAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-    n = channel_.writeAll(buffer_.data(), buffer_.size());
-    CHECK(n == header.dataLength) << "ERROR writing to socket";
-  }
-}
-
-class SocketClient {
- public:
-  SocketClient(const std::string& serverAddr, int serverPort);
-  SocketChannel* getChannel() const { return channel_.get(); }
-
- protected:
-  std::unique_ptr<SocketChannel> channel_;
-};
-
-SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent* server;
-
-  // char buffer[256];
-
-  /* Create a socket point */
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-  server = gethostbyname(serverAddr.c_str());
-  CHECK(server) << "ERROR, no such host: " << serverAddr;
-
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char*)server->h_addr,
-        (char*)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  /* Now connect to the server */
-  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR connecting";
-
-  channel_.reset(new SocketChannel(sockfd));
-}
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 10000000, "Data size");
-DEFINE_int32(loop_time, 100000, "test loop time");
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  SocketServer server(FLAGS_port);
-  server.start();
-  sleep(1);
-
-  SocketClient client(FLAGS_server_addr, FLAGS_port);
-
-  SocketChannel* channel = client.getChannel();
-
-  MessageHeader header;
-
-  uint64_t dataSize = FLAGS_dim * sizeof(real);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-#else
-  CpuVector gpuParam(FLAGS_dim);
-  CpuVector gpuGrad(FLAGS_dim);
-#endif
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int i = 0; i < FLAGS_loop_time; ++i) {
-    cpuGrad.copyFrom(gpuGrad);
-
-    header.dataLength = dataSize;
-    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
-        << "Client write header error";
-
-    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
-        << "Client write data error";
-
-    /* Now read server response */
-    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
-        << "Client read header error";
-
-    CHECK_EQ((uint64_t)header.dataLength, dataSize);
-    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
-        << "Client read data error";
-
-    gpuParam.copyFrom(cpuParam);
-
-    LOG_EVERY_N(INFO, 100) << "i=" << i;
-  }
-  exit(0);
-}
diff --git a/paddle/legacy/pserver/test/test_ParameterServer2.cpp b/paddle/legacy/pserver/test/test_ParameterServer2.cpp
deleted file mode 100644
index 542e80e04..000000000
--- a/paddle/legacy/pserver/test/test_ParameterServer2.cpp
+++ /dev/null
@@ -1,624 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterClient2.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-#include <paddle/legacy/utils/Flags.h>
-#include <paddle/legacy/utils/Util.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(num_gradient_servers);
-DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-DEFINE_int32(server_cpu, 0, "assign server cpu");
-
-class ParameterServer2Tester : public ParameterServer2 {
- public:
-  ParameterServer2Tester(std::string serverAddr,
-                         int port,
-                         int rdmaCpu = -1,
-                         bool sepSendAndRecv = false)
-      : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {}
-  virtual ~ParameterServer2Tester() {}
-  void setup() {
-    CHECK(ParameterServer2::init());
-
-    parameters_.clear();
-    clientConfigs_.clear();
-
-    clientConfigs_.resize(2);
-    {
-      ParameterConfig& config = clientConfigs_[0];
-      config.set_name("para0");
-      config.set_para_id(0);
-      config.set_size(10000);
-      config.set_device(-1);
-      config.set_learning_rate(1.0);
-      config.set_momentum(0.9);
-    }
-
-    {
-      ParameterConfig& config = clientConfigs_[1];
-      config.set_name("para1");
-      config.set_para_id(1);
-      config.set_size(5000);
-      config.set_device(-1);
-      config.set_learning_rate(0.5);
-      config.set_momentum(0.4);
-    }
-
-    for (auto& config : clientConfigs_) {
-      parameters_.emplace_back(new Parameter(config, /* useGpu= */ false));
-    }
-
-    size_t id = 0;
-    for (auto& para : parameters_) {
-      para->setID(id++);
-    }
-
-    CHECK(client_.init(parameters_));
-    OptimizationConfig optConfig;
-    optConfig.set_algorithm("async_sgd");
-    optConfig.set_batch_size(100);
-    optConfig.set_learning_rate(0.1);
-    client_.setConfig(optConfig);
-    client_.setParameter();
-  }
-
-  void setConfigTest();
-  void setStatusTest();
-  void sendParameterTest();
-  void sendDataTest(SendDataType type, size_t size);
-  void operationTest();
-  void mergeBlockSegmentTest();
-  void checkSegments(const BlockSegments& expected, const BlockSegments& segs);
-  void waitPassFinishTest();
-  void synchronizeTest();
-
- protected:
-  ParameterClient2 client_;
-  vector<ParameterConfig> clientConfigs_;
-  vector<ParameterPtr> parameters_;
-};
-
-std::unique_ptr<ParameterServer2Tester> g_server;
-
-void ParameterServer2Tester::setConfigTest() {
-  setup();
-
-  for (auto& config : clientConfigs_) {
-    auto it = configMap_.find(config.para_id());
-    EXPECT_TRUE(it != configMap_.end());
-    auto& serverConfig = it->second;
-    EXPECT_EQ(config.name(), serverConfig.name());
-    EXPECT_EQ(config.size(), serverConfig.size());
-    EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate());
-    EXPECT_EQ(config.momentum(), serverConfig.momentum());
-  }
-}
-
-void ParameterServer2Tester::setStatusTest() {
-  setup();
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET));
-  client_.setStatus(PSERVER_STATUS_PARAMETER_READY);
-  EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_);
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY));
-}
-
-real sumVector(const CpuVector& vec) {
-  const real* data = vec.getData();
-  size_t dim = vec.getSize();
-  real sum = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    sum += data[i];
-  }
-  return sum;
-}
-
-void ParameterServer2Tester::sendParameterTest() {
-  setup();
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,       // numSamples = 0
-                                  0,       // cost = 0
-                                  false);  // sendBackParameter = false
-
-  vector<ParameterPtr> parameterCopies;
-
-  for (auto& parameter : parameters_) {
-    parameterCopies.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCopies.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-  }
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,      // numSamples = 0
-                                  0,      // cost = 0
-                                  true);  // sendBackParameter = true
-
-  for (size_t i = 0; i != parameters_.size(); ++i) {
-    real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData();
-    real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData();
-    EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize());
-    size_t size = parameters_[i]->getSize();
-    real sum1 = 0, sum2 = 0;
-    for (size_t j = 0; j < size; ++j) {
-      sum1 += v1[j];
-      sum2 += v2[j];
-    }
-    EXPECT_EQ(sum1, sum2);
-  }
-}
-
-void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) {
-  ParameterClient2 client1(true);
-  client1.init(parameters_);
-  ParameterClient2 client2(true);
-  client2.init(parameters_);
-  ParameterClient2 client3(true);
-  client3.init(parameters_);
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  double* testData1 = new double[size];
-  double* testData2 = new double[size];
-  double* testData3 = new double[size];
-  double* getDataExpect = new double[size];
-  double* getDataReal = new double[size];
-  for (size_t i = 0; i < size; ++i) {
-    testData1[i] = rand();  // NOLINT TODO(yuyang18): Use rand_r instead.
-    testData2[i] = rand();  // NOLINT
-    testData3[i] = rand();  // NOLINT
-    getDataExpect[i] = testData1[i] + testData2[i] + testData3[i];
-  }
-
-  auto put1 = [&]() {
-    LOG(INFO) << "putOwnData1 start";
-    client1.putOwnData(0, type, testData1, size);
-    LOG(INFO) << "putOwnData1 finish";
-  };
-
-  auto get1 = [&]() {
-    LOG(INFO) << "sendData1 get all start";
-    client1.getAllData(0, type, getDataReal, size);
-    for (size_t i = 0; i < size; ++i) {
-      CHECK_EQ(getDataReal[i], getDataExpect[i]);
-    }
-    LOG(INFO) << "sendData1 get all finish";
-  };
-
-  auto put2 = [&]() {
-    LOG(INFO) << "putOwnData2 start";
-    client2.putOwnData(1, type, testData2, size);
-    LOG(INFO) << "putOwnData2 finish";
-  };
-
-  auto put3 = [&]() {
-    LOG(INFO) << "putOwnData3 start";
-    client3.putOwnData(2, type, testData3, size);
-    LOG(INFO) << "putOwnData3 finish";
-  };
-
-  worker1.addJob(put1);
-  worker1.addJob(get1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-
-  worker1.addJob(put1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-  worker1.addJob(get1);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-  free(testData1);
-  free(testData2);
-  free(testData3);
-  free(getDataExpect);
-  free(getDataReal);
-}
-
-void ParameterServer2Tester::operationTest() {
-  PServerVector v1, v2;
-  v1 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle);
-
-  v2 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RESET, v1, (real)1);
-  ops.addOperation(PSERVER_OP_RESET, v2, (real)2);
-
-  real res1, res2, res3;
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3);
-  client_.doOperation(ops, false, false);
-
-  EXPECT_EQ(30000, res1);
-  EXPECT_EQ(15000, res2);
-  EXPECT_EQ(0, res3);
-
-  PServerMatrix m1, m2;
-  m1 = client_.createMatrix(4);
-  EXPECT_EQ(0, m1.handle);
-  m2 = client_.createMatrix(8);
-  EXPECT_EQ(1, m2.handle);
-
-  // TODO(yuyang18): add tests for other operations OP_COPY, OP_au
-
-  client_.releaseVector(v1);
-  client_.releaseVector(v2);
-  client_.releaseMatrix(m1);
-  client_.releaseMatrix(m2);
-}
-
-void ParameterServer2Tester::checkSegments(const BlockSegments& expected,
-                                           const BlockSegments& segs) {
-  EXPECT_EQ(expected.size(), segs.size());
-  if (expected.size() != segs.size()) {
-    return;
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    EXPECT_EQ(expected[i], segs[i]);
-  }
-}
-
-void ParameterServer2Tester::mergeBlockSegmentTest() {
-  {
-    BlockSegments segs{{10, 20}, {30, 45}, {50, 70}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 20}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {10, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 35}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 60}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {30, 47}};
-    mergeSegments(&segs);
-    checkSegments({{30, 47}, {50, 70}}, segs);
-  }
-}
-
-void ParameterServer2Tester::waitPassFinishTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-  ParameterClient2 client3;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto init3 = [&]() {
-    LOG(INFO) << "init3 start";
-    client3.init(parameters_);
-    LOG(INFO) << "init3 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.waitPassFinish();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.waitPassFinish();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  auto op3 = [&]() {
-    LOG(INFO) << "op3 start";
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_SGD);
-    client3.doOperation(ops,
-                        /* waitForGradient= */ true,
-                        /* sendBackarameter= */ true);
-    LOG(INFO) << "op3 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  worker3.addJob(init3);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 2 finished";
-}
-
-void ParameterServer2Tester::synchronizeTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-
-  FLAGS_log_period_server = 2;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    client1.setTrainerId(0);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    client2.setTrainerId(1);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.asyncFinishPass();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.asyncFinishPass();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  // call wait to reset some stats at pserver
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker1.wait();
-  worker2.wait();
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 2 finished";
-}
-
-TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); }
-
-TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); }
-
-TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); }
-
-TEST(ParameterServer2, operation) { g_server->operationTest(); }
-
-TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); }
-
-TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); }
-
-TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); }
-
-TEST(ParameterServer2, sendData) {
-  // Set gserver and pserver all 3, so that the test is sufficient.
-  int oldFlagsPortsNUm = FLAGS_ports_num;
-  int oldFlagsNumGradientServers = FLAGS_num_gradient_servers;
-  int oldFlagsPort = FLAGS_port;
-  FLAGS_ports_num = 3;
-  FLAGS_num_gradient_servers = 3;
-  FLAGS_port = FLAGS_port + 1;
-  std::unique_ptr<ParameterServer2Tester> g_server1;
-  std::unique_ptr<ParameterServer2Tester> g_server2;
-  std::unique_ptr<ParameterServer2Tester> g_server3;
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server1.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-    g_server1->start();
-    g_server2.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1));
-    g_server2->start();
-    g_server3.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2));
-    g_server3->start();
-  } else {  // tcp
-    g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-    g_server1->start();
-    g_server2.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1));
-    g_server2->start();
-    g_server3.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2));
-    g_server3->start();
-  }
-
-  g_server2->init();
-  g_server3->init();
-  sleep(2);
-  g_server1->setup();
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24);
-  sleep(2);
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 2);
-  sleep(2);
-  g_server1.reset();
-  g_server2.reset();
-  g_server3.reset();
-
-  FLAGS_ports_num = oldFlagsPortsNUm;
-  FLAGS_num_gradient_servers = oldFlagsNumGradientServers;
-  FLAGS_port = oldFlagsPort;
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  FLAGS_num_gradient_servers = 2;
-
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-  } else {
-    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-  }
-
-  g_server->start();
-
-  sleep(2);
-
-  int ret = RUN_ALL_TESTS();
-
-  g_server.reset();
-
-  exit(ret);
-}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.cpp b/paddle/legacy/pserver/test/test_ProtoServer.cpp
deleted file mode 100644
index f7ab2e8af..000000000
--- a/paddle/legacy/pserver/test/test_ProtoServer.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <memory>
-#include "ParameterService.pb.h"
-#include "paddle/legacy/math/Vector.h"
-#include "paddle/legacy/pserver/ProtoServer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 50000000, "Data size");
-DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
-
-using namespace paddle;  // NOLINT
-
-class MyServer : public ProtoServer {
- public:
-  explicit MyServer(int port, int rdmaCpu = -1)
-      : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
-        status_(PSERVER_STATUS_NOT_SET) {
-    REGISTER_SERVICE_FUNCTION(MyServer, getStatus);
-    REGISTER_SERVICE_FUNCTION(MyServer, setStatus);
-    REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx);
-  }
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    callback(response);
-  }
-
-  void getStatusEx(const GetStatusRequest& request,
-                   std::unique_ptr<MsgReader> msgReader,
-                   ProtoResponseCallbackEx callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    buffer_.resize(msgReader->getNextBlockLength());
-    msgReader->readNextBlock(&buffer_[0]);
-    callback(response, {{&buffer_[0], buffer_.size()}});
-  }
-
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    SetStatusResponse response;
-    status_ = request.status();
-    callback(response);
-  }
-
- protected:
-  PServerStatus status_;
-  std::string buffer_;
-};
-
-TEST(ProtoServer, regular) {
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    auto msgReader = client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET);
-    EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0);
-  }
-
-  {
-    SetStatusRequest request;
-    SetStatusResponse response;
-    request.set_status(PSERVER_STATUS_PARAMETER_READY);
-    client->sendAndRecv("setStatus", request, &response);
-  }
-
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY);
-  }
-
-  delete client;
-}
-
-TEST(ProtoServer, extended) {
-#ifdef PADDLE_WITH_CUDA
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  int64_t dataSize = FLAGS_dim * sizeof(real);
-
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int k = 0; k < 4; ++k) {
-    for (int i = 0; i < 10; ++i) {
-      cpuGrad.copyFrom(gpuGrad);
-      if (FLAGS_test_proto_server) {
-        GetStatusRequest request;
-        GetStatusResponse response;
-        {
-          REGISTER_TIMER("sendAndRecv");
-          auto msgReader =
-              client->sendAndRecv("getStatusEx",
-                                  request,
-                                  {{cpuGrad.getData(), (size_t)dataSize}},
-                                  &response);
-
-          EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1);
-          EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize);
-          msgReader->readNextBlock(cpuParam.getData());
-        }
-        if (!FLAGS_benchmark) {
-          real* v1 = cpuGrad.getData();
-          real* v2 = cpuParam.getData();
-          real sum1 = 0, sum2 = 0;
-          for (int j = 0; j < FLAGS_dim; ++j) {
-            sum1 += v1[j];
-            sum2 += v2[j];
-          }
-          EXPECT_EQ(sum1, sum2);
-        }
-      }
-      gpuParam.copyFrom(cpuParam);
-
-      LOG_EVERY_N(INFO, 10) << "i=" << i;
-    }
-    globalStat.printAllStatus();
-    globalStat.reset();
-  }
-
-  delete client;
-#endif
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
-  server.start();
-  usleep(10000);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.sh b/paddle/legacy/pserver/test/test_ProtoServer.sh
deleted file mode 100755
index 143935084..000000000
--- a/paddle/legacy/pserver/test/test_ProtoServer.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -x
-for ((port=12340;port<=12360;port++))
-do
-    port_used_num=`netstat -a |grep $port|wc -l`
-    if [ $port_used_num -eq 0 ]
-    then
-        echo $port;
-        legacy/pserver/test/test_ProtoServer --port=$port
-        if [ $? -eq 0 ]
-           then
-               exit 0
-           else
-               echo "test_ProtoServer run wrong"
-       	       exit 1
-        fi
-fi
-done
-echo "test_ProtoServer port not found"
-exit 1
diff --git a/paddle/legacy/trainer/CMakeLists.txt b/paddle/legacy/trainer/CMakeLists.txt
deleted file mode 100644
index 6192de438..000000000
--- a/paddle/legacy/trainer/CMakeLists.txt
+++ /dev/null
@@ -1,73 +0,0 @@
-# paddle trainer package
-
-set(TRAINER_SOURCES
-        ParameterUpdater.cpp
-        ParamUtil.cpp
-        RemoteParameterUpdater.cpp
-        NewRemoteParameterUpdater.cpp
-        Tester.cpp
-        Trainer.cpp
-        TrainerInternal.cpp
-        TrainerBenchmark.cpp
-        ThreadParameterUpdater.cpp
-        TrainerInternalConfig.cpp
-        TrainerConfigHelper.cpp)
-
-set(TRAINER_HEADERS
-        ParameterUpdater.h
-        ParamUtil.h
-        RemoteParameterUpdater.h
-        NewRemoteParameterUpdater.h
-        Tester.h
-        TesterConfig.h
-        Trainer.h
-        TrainerInternal.h
-        TrainerInternalConfig.h
-        ThreadParameterUpdater.h
-        TrainerConfigHelper.h)
-
-if(NOT WITH_GOLANG)
-  list(REMOVE_ITEM TRAINER_SOURCES
-          NewRemoteParameterUpdater.cpp)
-  list(REMOVE_ITEM TRAINER_HEADERS
-          NewRemoteParameterUpdater.h)
-endif()
-
-add_library(paddle_trainer_lib STATIC
-    ${TRAINER_SOURCES})
-
-add_dependencies(paddle_trainer_lib
-    paddle_proto
-    ${external_project_dependencies})
-
-macro(add_paddle_exe TARGET_NAME)
-  add_executable(${TARGET_NAME} ${ARGN})
-  link_paddle_exe(${TARGET_NAME})
-endmacro()
-
-if(WITH_TESTING)
-  add_subdirectory(tests)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_paddle_exe(paddle_trainer TrainerMain.cpp)
-  add_paddle_exe(paddle_merge_model MergeModel.cpp)
-
-  install(TARGETS paddle_trainer paddle_merge_model
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-  set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
-
-if(APPLE)
-  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-endif()
-
-if(WITH_GOLANG)
-  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer paddle_pserver_cclient)
-endif(WITH_GOLANG)
diff --git a/paddle/legacy/trainer/MergeModel.cpp b/paddle/legacy/trainer/MergeModel.cpp
deleted file mode 100644
index 8a3601f19..000000000
--- a/paddle/legacy/trainer/MergeModel.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-#include "paddle/legacy/pserver/ParameterServer2.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-DEFINE_string(model_dir, "", "Directory for separated model files");
-DEFINE_string(config_file, "", "Config file for the model");
-DEFINE_string(model_file, "", "File for merged model file");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
-      FLAGS_model_file.empty()) {
-    LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
-                 "--config_file=config.py --model_file=out.paddle";
-    return 0;
-  }
-
-  string confFile = FLAGS_config_file;
-#ifndef PADDLE_WITH_CUDA
-  FLAGS_use_gpu = false;
-#endif
-  auto config = std::make_shared<TrainerConfigHelper>(confFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(FLAGS_model_dir);
-
-  ofstream os(FLAGS_model_file);
-
-  string buf;
-  config->getConfig().SerializeToString(&buf);
-  int64_t size = buf.size();
-  os.write((char*)&size, sizeof(size));
-  CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  os.write(buf.data(), buf.size());
-  vector<ParameterPtr>& parameters = gradientMachine->getParameters();
-  for (auto& para : parameters) {
-    para->save(os);
-    CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  }
-  os.close();
-
-  return 0;
-}
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
deleted file mode 100644
index cdd832acd..000000000
--- a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NewRemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config, const std::string pserverSpec)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec) {}
-
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config,
-    const std::string pserverSpec,
-    const bool useEtcd)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec),
-      useEtcd_(useEtcd) {}
-
-void NewRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr> &parameters) {
-  ParameterUpdater::init(parameters);
-
-  // create parameter server client.
-  if (useEtcd_) {
-    parameterClient_ =
-        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
-  } else {
-    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                                 FLAGS_trainer_id == 0);
-  }
-
-  // init new parameter and gradient.
-  newParameters_ = initNewParameter(PARAMETER_VALUE);
-  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
-
-  // init parameter, one trainer will get the opportunity to int parameter and
-  // send them to parameter server. Others will get the initialized parameter
-  // from parameter server
-  if (paddle_begin_init_params(parameterClient_)) {
-    LOG(INFO) << "paddle_begin_init_params start";
-    // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig.
-    // This makes golang pserver compatible with handy V1 demos.
-    // TODO(wuyi): Refine or remove these ugly converting lines
-    OptimizerConfig optimizerConfigV2;
-    if (trainerConfig_.learning_method() == "momentum") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    } else if (trainerConfig_.learning_method() == "adagrad") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adagrad()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-    } else if (trainerConfig_.learning_method() == "adadelta") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adadelta()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
-    } else if (trainerConfig_.learning_method() == "adam") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
-      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
-      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
-      optimizerConfigV2.mutable_adam()->set_epsilon(
-          trainerConfig_.adam_epsilon());
-    } else {
-      LOG(ERROR) << "got unsupported v1 optimizer config: "
-                 << trainerConfig_.learning_method();
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    }
-
-    if (trainerConfig_.learning_rate_schedule() == "constant") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
-      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
-          trainerConfig_.learning_rate_decay_a());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
-          trainerConfig_.learning_rate_decay_b());
-    } else {
-      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
-                 << trainerConfig_.learning_rate_schedule() << ", set to const";
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    }
-
-    // overwrite optimizerConfigV2 for per-parameter(layer) configs
-    for (int i = 0; i < parameterSize(); ++i) {
-      // FIXME(typhoonzero): paramConfig always have default values,
-      // how to check if it's default?
-      // TODO(typhoonzero): log output: optimizerConfigV2.DebugString();
-      LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString();
-      // send param and config to pserver
-      std::string bytes = optimizerConfigV2.SerializeAsString();
-      const char *array = bytes.data();
-      int size = (int)bytes.size();
-      paddle_init_param(
-          parameterClient_, *newParameters_[i], (void *)array, size);
-    }
-    paddle_finish_init_params(parameterClient_);
-    LOG(INFO) << "paddle_begin_init_params done";
-  } else {
-    paddle_get_params(parameterClient_, newParameters_, parameterSize());
-  }
-
-  LOG(INFO) << "NewRemoteParameterUpdater initialized";
-}
-
-void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
-
-void NewRemoteParameterUpdater::finishBatch(real cost) {
-  // send gradient to parameter server.
-  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
-  // get the updated parameter from parameterClient.
-  paddle_get_params(parameterClient_, newParameters_, parameterSize());
-
-  // clear gradient after update parameter.
-  for (auto &para : parameters_) {
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-}
-
-void NewRemoteParameterUpdater::startPass() {}
-
-bool NewRemoteParameterUpdater::finishPass() { return true; }
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.h b/paddle/legacy/trainer/NewRemoteParameterUpdater.h
deleted file mode 100644
index 707e9ceb9..000000000
--- a/paddle/legacy/trainer/NewRemoteParameterUpdater.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "OptimizerConfig.pb.h"
-#include "ParameterUpdater.h"
-#include "libpaddle_pserver_cclient.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-/**
- * New remote parameter updater for dense parameters that use cclient of go.
- */
-class NewRemoteParameterUpdater : public ParameterUpdater {
- public:
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec);
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec,
-                            const bool useEtcd);
-  ~NewRemoteParameterUpdater() {
-    releaseNewParameter(newParameters_);
-    releaseNewParameter(newGradients_);
-    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
- protected:
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
- private:
-  int parameterSize() { return (int)parameters_.size(); }
-
-  /**
-   * init parameter of go paddle pserver cclient.
-   * @param new_params
-   * @param type
-   */
-  paddle_parameter** initNewParameter(ParameterType type) {
-    paddle_parameter** new_params =
-        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
-    for (int i = 0; i < parameterSize(); ++i) {
-      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
-      memset(new_params[i], 0, sizeof(paddle_parameter));
-    }
-
-    for (int i = 0; i < parameterSize(); ++i) {
-      ParameterPtr param = parameters_[i];
-      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-      new_params[i]->name = (char*)param->getName().c_str();
-      new_params[i]->content =
-          (unsigned char*)(param->getBuf(type).get()->getData());
-      new_params[i]->content_len =
-          (int)param->getBuf(type).get()->getSize() * sizeof(real);
-    }
-    return new_params;
-  }
-
-  void releaseNewParameter(paddle_parameter** newParams) {
-    if (newParams != nullptr) {
-      for (int i = 0; i < parameterSize(); ++i) {
-        free(newParams[i]);
-      }
-      free(newParams);
-    }
-  }
-
- protected:
-  const OptimizationConfig& trainerConfig_;
-  /// internal parameter client object for exchanging data with pserver
-  paddle_pserver_client parameterClient_;
-  /// the parameters for new pserver client
-  paddle_parameter** newParameters_;
-  /// the gradinets for new pserver client
-  paddle_parameter** newGradients_;
-  /// the specification of parameter server "host1:port,host1:port"
-  std::string pserverSpec_;
-  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
-  bool useEtcd_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.cpp b/paddle/legacy/trainer/ParamUtil.cpp
deleted file mode 100644
index b5aba32de..000000000
--- a/paddle/legacy/trainer/ParamUtil.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParamUtil.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-#include <paddle/legacy/utils/Version.h>
-
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-ParameterUtil::ParameterUtil(
-    const std::shared_ptr<TrainerConfigHelper> &config,
-    std::unique_ptr<ParameterUtilConfig> &&intconfig,
-    const GradientMachinePtr &gradientMachine,
-    const std::shared_ptr<ParameterUpdater> &parameterUpdater) {
-  config_ = config;
-  intConfig_ = std::move(intconfig);
-  gserver_ = gradientMachine;
-  pUpdater_ = parameterUpdater;
-}
-
-bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  std::string doneFile = path::join(config_->getSaveDir(), buf, "done");
-  if (!fileExist(doneFile.c_str())) return false;
-  loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote);
-  return true;
-}
-
-void ParameterUtil::loadParametersWithPath(const std::string &dir,
-                                           bool local,
-                                           bool remote) {
-  if (local) {
-    gserver_->loadParameters(dir);
-  }
-  if (remote && pUpdater_) {
-    pUpdater_->loadParametersRemote(dir);
-  }
-}
-
-void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) {
-  pUpdater_->apply();
-  saveParameters(passId, passInnerId);
-  if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) {
-    deleteParameters(passId - intConfig_->saving_period_);
-  }
-  pUpdater_->restore();
-}
-
-void ParameterUtil::saveParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  if (passInnerId > 0) {
-    snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "pass-%05d", passId);
-  }
-
-  std::string basePath = config_->getSaveDir();
-  if (basePath.find('/') == std::string::npos) {
-    basePath = "./" + basePath;
-  }
-  mkDirRecursively(basePath.c_str());
-
-  std::string saveDir = path::join(basePath, buf);
-  mkDir(saveDir.c_str());
-  if (!intConfig_->load_save_param_pserver_) {
-    pUpdater_->getParametersRemote(true /*full parameter*/,
-                                   true /*after apply*/);
-  }
-
-  gserver_->saveParameters(saveDir);
-  if (intConfig_->load_save_param_pserver_) {
-    pUpdater_->saveParametersRemote(saveDir);
-  }
-  std::string doneFile = path::join(saveDir, "done");
-  touchFile(doneFile.c_str());
-  std::ofstream out(doneFile);
-  version::printVersion(out);
-  out.close();
-  VLOG(1) << "save dir " << saveDir;
-  saveConfigWithPath(saveDir);
-}
-
-void ParameterUtil::deleteParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  const std::string &saveDir = config_->getSaveDir();
-  if (passInnerId > 0) {
-    snprintf(buf,
-             kBufLen,
-             "%s/pass-%05d-%03d",
-             saveDir.c_str(),
-             passId,
-             passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId);
-  }
-  mkDir(saveDir.c_str());
-  LOG(INFO) << "delete dir " << buf;
-  rmDir(buf);
-}
-
-void ParameterUtil::saveConfigWithPath(const std::string &path) {
-  std::string src;
-  // save config in some path
-  if (!intConfig_->config_.empty()) {
-    src = intConfig_->config_;
-  } else {
-    bool ok;
-    src = config_->getConfigName(&ok);
-    if (!ok) {
-      return;
-    }
-  }
-  copyFileToPath(src, path);
-
-  // save other import config file name to path.txt
-  std::string ss = path::join(path, "path.txt");
-  std::ofstream os(ss);
-  std::string fileName = path::basename(src);
-  CHECK(os.write(fileName.c_str(), fileName.length()))
-      << "Fail to write config file name " << ss;
-  VLOG(1) << "fileName " << fileName;
-  os.close();
-
-  // copy other import config files
-  for (int i = 0; i < config_->getConfig().config_files_size(); ++i) {
-    copyFileToPath(config_->getConfig().config_files(i), path);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.h b/paddle/legacy/trainer/ParamUtil.h
deleted file mode 100644
index 077869677..000000000
--- a/paddle/legacy/trainer/ParamUtil.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-
-namespace paddle {
-
-/**
- * Configuration for parameter utils.
- */
-struct ParameterUtilConfig {
-  DISABLE_COPY(ParameterUtilConfig);
-
-  ParameterUtilConfig(bool save_only_one,
-                      int saving_period,
-                      bool load_save_parameters_in_pserver,
-                      std::string config)
-      : save_only_one_(save_only_one),
-        saving_period_(saving_period),
-        load_save_param_pserver_(load_save_parameters_in_pserver),
-        config_(config) {}
-
-  bool save_only_one_;
-  int saving_period_;
-  bool load_save_param_pserver_;
-  std::string config_;
-};
-
-/**
- * ParameterUtil
- * Utility class for loading and saving parameters
- */
-class ParameterUtil {
- public:
-  /**
-   * Ctor.
-   *
-   * @param config
-   * @param intconfig
-   * @param gradientMachine
-   * @param parameterUpdater
-   * @return
-   */
-  ParameterUtil(const std::shared_ptr<TrainerConfigHelper> &config,
-                std::unique_ptr<ParameterUtilConfig> &&intconfig,
-                const GradientMachinePtr &gradientMachine,
-                const std::shared_ptr<ParameterUpdater> &parameterUpdater);
-
-  /// Load parameter from the saved parameter file as pass passId
-  /// if loadsave_parameters_in_pserver is set, some parameters MUST
-  /// load in pserver, which is "remote".
-  /// loadParameters can choose to load local/remote parameter, or both.
-  bool loadParameters(int passId, bool local = true, bool remote = false);
-
-  /// load parameters given path info
-  void loadParametersWithPath(const std::string &dir,
-                              bool local = true,
-                              bool remote = false);
-
-  /// Save parameter to dist for pass passId
-  /// passInnerId means saving times in one pass, some users want to
-  /// save parameters when have processed some batches in one pass
-  /// passInnerId = 0 means do not need to save in one inner pass
-  void saveParameters(int passId, int passInnerId = 0);
-
-  /// save parameters for one pass, when passInnerId > 0 means saving
-  /// the passInnerId times in one pass
-  void saveParametersOnePass(int passId, int passInnerId = 0);
-
-  /// delete parameter from disk via passId
-  void deleteParameters(int passId, int passInnerId = 0);
-
-  /// save config given path info
-  void saveConfigWithPath(const std::string &path);
-
-  /**
-   * Try to load parameter from config.
-   * @return true if can load from trainer config.
-   */
-  inline bool tryLoadParametersFromConfig() {
-    auto &c = config_->getConfig();
-    if (!c.init_model_path().empty()) {
-      loadParametersWithPath(c.init_model_path());
-      return true;
-    } else if (c.start_pass() > 0) {
-      CHECK(loadParameters(c.start_pass() - 1));
-      return true;
-    } else {
-      return false;
-    }
-  }
-
- private:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<ParameterUtilConfig> intConfig_;
-  GradientMachinePtr gserver_;
-  std::shared_ptr<ParameterUpdater> pUpdater_;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.cpp b/paddle/legacy/trainer/ParameterUpdater.cpp
deleted file mode 100644
index 549fb0332..000000000
--- a/paddle/legacy/trainer/ParameterUpdater.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdater.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/utils/Thread.h"
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-
-SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
-    const OptimizationConfig& optConfig)
-    : SgdLocalUpdater(optConfig, false /*with averager*/) {
-  CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu());
-  averager_.reset(AverageOptimizer::create(optConfig,
-                                           new DummyOptimizer(optConfig),
-                                           false /*sparse*/,
-                                           true /*apply*/));
-  updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
-}
-
-void SgdUpdaterWithCpuAverager::init(
-    const std::vector<ParameterPtr>& parameters) {
-  SgdLocalUpdater::init(parameters);
-  averager_->init(parameters_.size(), nullptr);
-  copyEvents_.resize(parameters_.size());
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                              /* useGpu= */ false,
-                                              /* doInit= */ false));
-    if (parameter->useGpu()) {
-      cpuParameters_.back()->enableType(PARAMETER_APPLY);
-    } else {
-      cpuParameters_.back()->enableSharedType(
-          PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE));
-    }
-    for (ParameterType type : averager_->getParameterTypes()) {
-      cpuParameters_.back()->enableType(type);
-    }
-
-    hl_create_event(&copyEvents_[nonStaticParaIDMap_[parameter->getID()]]);
-  }
-}
-
-SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() {
-  for (auto& event : copyEvents_) {
-    hl_destroy_event(event);
-  }
-}
-
-void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) {
-  SgdLocalUpdater::updateImpl(para);
-
-  if (para->useGpu()) {
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    cpuPara->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream);
-    hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]);
-  }
-
-  updateWorker_.addJob(
-      std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para));
-}
-
-void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) {
-  SetDevice setDevice(para->getDeviceId());
-  size_t pid = nonStaticParaIDMap_[para->getID()];
-  Parameter* cpuPara = cpuParameters_[pid].get();
-  if (para->useGpu()) {
-    hl_event_synchronize(copyEvents_[pid]);
-  }
-  averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU);
-}
-
-void SgdUpdaterWithCpuAverager::finishBatch(real cost) {
-  SgdLocalUpdater::finishBatch(cost);
-
-  updateWorker_.wait();
-  for (auto para : cpuParameters_) {
-    if (auto callback = averager_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-  averager_->finishBatch();
-}
-
-void SgdUpdaterWithCpuAverager::apply() {
-  // backup gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    para->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream);
-  }
-
-  // apply on cpu parameter
-  if (auto callback = averager_->apply()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // copy to gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    if (parameters_[pid]->useGpu()) {
-      para->getBuf(PARAMETER_VALUE)
-          ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream);
-    }
-  }
-  hl_stream_synchronize(kHostToDeviceStream);
-  for (auto& para : parameters_) {
-    para->setValueUpdated();
-  }
-}
-
-void SgdUpdaterWithCpuAverager::restore() {
-  // restore on cpu parameter
-  if (auto callback = averager_->restore()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // restore gpu value
-  for (auto& para : parameters_) {
-    SetDevice device(para->getDeviceId());
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-    para->setValueUpdated();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.h b/paddle/legacy/trainer/ParameterUpdater.h
deleted file mode 100644
index acddc3702..000000000
--- a/paddle/legacy/trainer/ParameterUpdater.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Thread.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
-
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/gserver/layers/Layer.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * @brief Parameter Updater for SGD, and local(not cluster) run.
- */
-class SgdLocalUpdater : public ParameterUpdater {
- public:
-  /**
-   * @brief Ctor. Initialize optimizer locally by optConfig.
-   * @param optConfig optimization config.
-   * @param withAverager with average optimizer or not, default is true.
-   */
-  explicit SgdLocalUpdater(const OptimizationConfig& optConfig,
-                           bool withAverager = true)
-      : numSamplesProcessed_(0) {
-    auto baseOptimizer = ParameterOptimizer::create(optConfig);
-    optimizer_.reset(withAverager
-                         ? AverageOptimizer::create(optConfig, baseOptimizer)
-                         : baseOptimizer);
-    CHECK(optimizer_) << "fail to create optimizer: "
-                      << optConfig.learning_method();
-    auto types = optimizer_->getParameterTypes();
-    for (auto type : types) {
-      addParameterType(type);
-    }
-  }
-
-  /**
-   * @brief Initialize parameters and optimizer_.
-   *        For example,
-   *           If optimizer need hassien vector, then parameter's hassien will
-   *           be initialized.
-   * @param parameters The parameter need to be initialized.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters) {
-    ParameterUpdater::init(parameters);
-    optimizer_->init(parameters_.size(), nullptr);
-    // check no L1 decay in parameter configs
-    CHECK(std::find_if(parameters.begin(),
-                       parameters.end(),
-                       [](const ParameterPtr& para) {
-                         return para->getConfig().decay_rate_l1() > 0.0f;
-                       }) == parameters.end())
-        << "SgdLocalUpdater cannot support L1 decay in parameter";
-  }
-
-  /**
-   * @brief Start a batch with current mini-batch size
-   * @param current mini-batch size.
-   * @return Always PASS_TRAIN.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    numSamplesProcessed_ += batchSize;
-    optimizer_->startBatch(numSamplesProcessed_);
-    return PASS_TRAIN;
-  }
-
-  /**
-   * @brief finish a mini-batch.
-   */
-  virtual void finishBatch(real cost) { optimizer_->finishBatch(); }
-
-  /**
-   * @brief start a pass.
-   */
-  virtual void startPass() { optimizer_->startPass(); }
-
-  /**
-   * @brief finish a pass.
-   * @param cost sum cost during one pass.
-   * @return true if accept (used for owlqn).
-   */
-  virtual bool finishPass() {
-    optimizer_->finishPass();
-    return ParameterUpdater::finishPass();
-  }
-
-  /**
-   * @brief apply model average.
-   */
-  virtual void apply() {
-    if (auto callback = optimizer_->apply()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
-  /**
-   * @brief restore parameter value before model average
-   */
-  virtual void restore() {
-    if (auto callback = optimizer_->restore()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
- protected:
-  /**
-   * @brief update method. Update value from gradient.
-   * @param para parameter that will be updated.
-   */
-  virtual void updateImpl(Parameter* para) {
-    optimizer_->update(para->getBufs(), para->getConfig());
-    if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1UL);
-    }
-
-    para->setValueUpdated();
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-
-  /**
-   * @brief total number of samples processed.
-   */
-  int64_t numSamplesProcessed_;
-};
-
-/**
- * @brief SgdCpuUpdater is used only in recursive neural network
- * @deprecated
- */
-class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
- public:
-  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
-      : SgdLocalUpdater(optConfig),
-        Deprecated(
-            "SgdCpuUpdater is used only in recursive neural network, "
-            "and recursive neural network is deprecated in paddle. "
-            "Use it all by your own.") {}
-
-  /**
-   * @brief update all parameter on finish batch.
-   * @param cost
-   */
-  virtual void finishBatch(real cost) {
-    for (auto para : parameters_) {
-      SgdLocalUpdater::update(para.get());
-    }
-    optimizer_->finishBatch();
-  }
-
- protected:
-  /**
-   * @brief do nothing.
-   * @param para
-   */
-  virtual void updateImpl(Parameter* para) {}
-};
-
-/**
- * @brief Sgd Local Updater With average in cpu.
- *
- * It will do model average in cpu to reduce gpu memory comsuption.
- */
-class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
- public:
-  /**
-   * @brief Ctor.
-   *
-   * SgdUpdaterWithCpuAverager will do everything as a
-   * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model
-   * average in cpu.
-   */
-  explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig);
-  ~SgdUpdaterWithCpuAverager();
-
-  /**
-   * @brief init. Initialize cpu parameters, model average optimizer.
-   * @param parameters
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  virtual PassType startBatch(int64_t batchSize) {
-    averager_->startBatch(-1UL);
-    return SgdLocalUpdater::startBatch(batchSize);
-  }
-  virtual void finishBatch(real cost);
-
-  virtual void startPass() {
-    averager_->startPass();
-    SgdLocalUpdater::startPass();
-  }
-  virtual bool finishPass() {
-    averager_->finishPass();
-    return SgdLocalUpdater::finishPass();
-  }
-
-  /// apply the averaged parameter to PARAMETER_VALUE
-  /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE
-  virtual void apply();
-
-  /**
-   * @brief Restore parameter before apply().
-   */
-  virtual void restore();
-
- protected:
-  virtual void updateImpl(Parameter* para);
-
-  void updateFunc(Parameter* para);
-
- protected:
-  std::unique_ptr<ParameterOptimizer> averager_;
-
-  /**
-   * @brief The thread worker which do model average.
-   *
-   * For each parameter, GPU->CPU parameter is async, and do model average in
-   * another thread. Because the training process don't need model average while
-   * training, and model average only used in evaluation stage and saving stage.
-   * So the model average is totally async.
-   */
-  ThreadWorker updateWorker_;
-
-  /**
-   * @brief The parameter mirror in cpu.
-   */
-  std::vector<ParameterPtr> cpuParameters_;
-
-  /**
-   * @brief GPU -> CPU copy event. Model average will wait after copy done.
-   */
-  std::vector<hl_event_t> copyEvents_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.cpp b/paddle/legacy/trainer/RemoteParameterUpdater.cpp
deleted file mode 100644
index 5de1cc782..000000000
--- a/paddle/legacy/trainer/RemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-static const int kFinishBatchPid = -1;
-
-const std::string RemoteParameterUpdater::kAverage = "average";
-const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average";
-
-RemoteParameterUpdater::RemoteParameterUpdater(
-    const OptimizationConfig& config,
-    int expectedPassCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : config_(config),
-      localUpdater_(std::move(localUpdater)),
-      numBatches_(0),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      separateSendAndRecv_(false),
-      isFirstPass_(true),
-      useApplyInPserver_(false) {
-  addParameterType(PARAMETER_MOMENTUM);
-}
-
-void RemoteParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  if (localUpdater_) {
-    localUpdater_->init(parameters);
-
-    for (auto& parameter : parameters) {
-      parameter->enableType(PARAMETER_DELTA);
-    }
-
-    CHECK(config_.center_parameter_update_method() == kAverage ||
-          config_.center_parameter_update_method() == kElasticAverage)
-        << "unknown center_parameter_update_method";
-
-    // modify delta_add_rate
-    CHECK_GT(FLAGS_num_gradient_servers, 1)
-        << "FLAGS_num_gradient_servers should be set in trainer args.";
-    real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers;
-    config_.set_delta_add_rate(delta_add_rate);
-    LOG(INFO) << "center parameter in pserver,"
-              << " modify delta_add_rate=" << delta_add_rate;
-  }
-
-  if (!FLAGS_use_gpu) {
-    cpuParameters_ = parameters;
-  } else {
-    for (auto& parameter : parameters) {
-      cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                                /* useGpu= */ false));
-      cpuParameters_.back()->setID(parameter->getID());
-      if (localUpdater_) {
-        cpuParameters_.back()->enableType(PARAMETER_DELTA);
-      }
-    }
-  }
-
-  parameterClient_.reset(new ParameterClient2(separateSendAndRecv_));
-  parameterClient_->init(cpuParameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(config_);
-    copyParametersFromDevice(PARAMETER_VALUE);
-    parameterClient_->setParameter();
-    parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-  } else {
-    parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-  if (FLAGS_trainer_id == 0 &&
-      (config_.algorithm() != TrainAlgorithm::AsyncSGD)) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-void RemoteParameterUpdater::startController() {
-  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
-}
-
-void RemoteParameterUpdater::controller() {
-  ParameterClient2 client(false);
-  client.init(cpuParameters_);
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackarameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersToDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    parameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*cpuParameters_[i]->getBuf(parameterType));
-    if (parameterType == PARAMETER_VALUE) {
-      parameters_[i]->setValueUpdated();
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersFromDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    cpuParameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*parameters_[i]->getBuf(parameterType));
-  }
-}
-
-void RemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-  }
-}
-
-void RemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-  }
-
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-
-  ParameterType sendType;
-  bool sendBackParameter = true;
-  if (localUpdater_) {
-    ++numBatches_;
-    if (numBatches_ % config_.num_batches_per_send_parameter() != 0) {
-      return;
-    }
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      parameterClient_->getParameter(PARAMETER_DELTA);
-      copyParametersToDevice(PARAMETER_DELTA);
-      sendBackParameter = false;  // no need send back after send
-
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-
-        // when delta send to pserver, pserver will do:
-        // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE)
-      }
-    } else {
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-    }
-
-    sendType = PARAMETER_DELTA;
-
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-
-  copyParametersFromDevice(sendType);
-
-  {
-    REGISTER_TIMER("sendAndRecv_dense");
-    parameterClient_->sendAndReceiveParameter(mode,
-                                              sendType,
-                                              batchSize_,
-                                              0,  // cost = 0
-                                              sendBackParameter);
-  }
-
-  if (sendBackParameter) {
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE)
-        para->getBuf(PARAMETER_VALUE)
-            ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate());
-      }
-
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  } else {
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      para->getBuf(sendType)->zeroMem();
-    }
-  }
-}
-
-void RemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-  } else {
-    // sync could benifits reducing lagged trainer for async-sgd
-    // even if sync could not remove all lagged trainer for the
-    // sake of file loading, buffer etc.
-    parameterClient_->asyncStartPass();
-  }
-
-  if (localUpdater_) {
-    localUpdater_->startPass();
-    numBatches_ = 0;
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      if (!isFirstPass_) {
-        // restore local value from delta
-        for (auto& para : parameters_) {
-          SetDevice device(para->getDeviceId());
-          para->getBuf(PARAMETER_VALUE)
-              ->copyFrom(*para->getBuf(PARAMETER_DELTA));
-        }
-      }
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-}
-
-bool RemoteParameterUpdater::finishPass() {
-  if (localUpdater_) {
-    localUpdater_->finishPass();
-  }
-
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-  } else {
-    parameterClient_->asyncFinishPass();
-  }
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      // backup local value to delta as we will get
-      // the remote parameter for saving/testing
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-  parameterClient_->getParameter();
-  copyParametersToDevice(PARAMETER_VALUE);
-
-  isFirstPass_ = false;
-  return true;
-}
-
-void RemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackarameter= */ false);
-    parameterClient_->getParameter(
-        /* recvParameterType= */ PARAMETER_VALUE,
-        /* sendBackParameterType= */ PARAMETER_APPLY);
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-void RemoteParameterUpdater::restore() {
-  if (useApplyInPserver_) {
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater(
-    OptimizationConfig config,
-    int passCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) {
-  sendThread_.reset(new std::thread([this]() { this->send(); }));
-  recvThread_.reset(new std::thread([this]() { this->recv(); }));
-
-  stopping_ = false;
-  oneBatchFinished_ = false;
-  separateSendAndRecv_ = true;
-}
-
-ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() {
-  stopping_ = true;
-  sendQueue_.enqueue(0);
-  sendThread_->join();
-  recvQueue_.enqueue(0);
-  recvThread_->join();
-}
-
-void ConcurrentRemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-
-    if (!needToUpdateRemotely()) {
-      ++numBatches_;
-      return;
-    }
-  }
-
-  sendQueue_.enqueue(kFinishBatchPid);
-
-  finishBatchCond_.wait([this]() { return oneBatchFinished_; });
-  oneBatchFinished_ = false;
-  {
-    REGISTER_TIMER("sync_hostToDeviceStream");
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      hl_stream_synchronize(kHostToDeviceStream);
-    }
-  }
-
-  if (localUpdater_) {
-    ++numBatches_;
-  }
-}
-
-// Use para=NULL to signal the end of one batch
-void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-  ParameterType sendType;
-  if (localUpdater_) {
-    sendType = PARAMETER_DELTA;
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-  std::vector<ParameterSegments> paraSegment;
-  if (para == NULL) {
-    parameterClient_->sendParameter(
-        mode,
-        sendType,
-        paraSegment,
-        batchSize_,
-        0,              // cost=0
-        true,           // sendBackParameter = true
-        batchStatus_);  // batchStatus_ = BATCH_FINISH
-
-  } else {
-    ParameterSegments paraSegTemp;
-    paraSegment.reserve(1);
-    paraSegTemp.name = para->getName();
-    paraSegTemp.id = para->getID();
-    paraSegment.push_back(paraSegTemp);
-    {
-      SetDevice device(para->getDeviceId());
-      REGISTER_TIMER("copySingleParaFromDevice");
-      copySingleParaFromDevice(para, sendType);
-      hl_stream_synchronize(kDeviceToHostStream);
-    }
-    parameterClient_->sendParameter(mode,
-                                    sendType,
-                                    paraSegment,
-                                    batchSize_,
-                                    0,     // cost=0
-                                    true,  // sendBackParameter = true
-                                    batchStatus_);
-    if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON;
-  }
-}
-void ConcurrentRemoteParameterUpdater::recv(Parameter* para) {
-  parameterClient_->recvParameter();
-  if (para != NULL) {
-    REGISTER_TIMER("copySingleParaToDevice");
-    SetDevice device(para->getDeviceId());
-    copySingleParaToDevice(para, PARAMETER_VALUE);
-
-    if (localUpdater_) {
-      para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-    } else {
-      // if cpu, parameter should not changes until recvParameter().
-      // if gpu, zero mem when send finish
-      if (!FLAGS_use_gpu) {
-        para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-      }
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::recv() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("recv");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("recv_dequeue");
-      pid = recvQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; });
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      oneBatchFinished_ = false;
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::send() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("send");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("send_dequeue");
-      pid = sendQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      batchStatus_ = BATCH_FINISH;
-      if (!localUpdater_) {
-        // if cpu, parameter should not changes until recvParameter().
-        // if gpu, zeroMem() at the end of batch so that it won't
-        // interfere with computation.
-        if (FLAGS_use_gpu) {
-          REGISTER_TIMER("para_zeroMem");
-          for (auto& para : parameters_) {
-            SetDevice device(para->getDeviceId());
-            para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-          }
-        }
-      }
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      recvQueue_.enqueue(pid);
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      if (localUpdater_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-    if (!needToUpdateRemotely()) {
-      return;
-    }
-  }
-  sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-}
-
-void ConcurrentRemoteParameterUpdater::copySingleParaToDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int i = nonStaticParaIDMap_[para->getID()];
-  para->getBuf(parameterType)
-      ->copyFrom(*cpuParameters_[i]->getBuf(parameterType),
-                 kHostToDeviceStream);
-  if (parameterType == PARAMETER_VALUE) {
-    para->setValueUpdated();
-  }
-}
-
-// Copies one buffer of `para` into the matching entry of cpuParameters_ on
-// kDeviceToHostStream. Does nothing unless FLAGS_use_gpu is set.
-void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (FLAGS_use_gpu) {
-    const int shadowIndex = nonStaticParaIDMap_[para->getID()];
-    auto& deviceBuf = *para->getBuf(parameterType);
-    cpuParameters_[shadowIndex]
-        ->getBuf(parameterType)
-        ->copyFrom(deviceBuf, kDeviceToHostStream);
-  }
-}
-
-// Stores the optimization config, the expected number of passes and the
-// testing flag. The pass counter starts at 0 and useApplyInPserver_ at false.
-// The parameter client is not created here; init() does that.
-SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(
-    const OptimizationConfig& config, int expectedPassCount, bool testing)
-    : config_(config),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      testing_(testing),
-      useApplyInPserver_(false) {}
-
-// Creates and initializes the parameter client for the sparse ports. On
-// trainer 0 it additionally pushes the config to the pservers, uploads the
-// parameters (or zeroes them remotely when the first parameter is not
-// full-size) and, for non-testing SGD runs, starts the controller thread.
-void SparseRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  parameterClient_.reset(new ParameterClient2(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
-  parameterClient_->init(parameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  // Everything below is done by trainer 0 only.
-  if (FLAGS_trainer_id != 0) {
-    return;
-  }
-
-  parameterClient_->setConfig(
-      config_, FLAGS_save_dir, true /*is_sparse_server*/);
-  const bool firstIsFullSize = parameters[0]->isFullSize();
-  if (firstIsFullSize) {
-    parameterClient_->setParameter();
-  } else {  // init in pserver
-    parameterClient_->setParameterZero();
-  }
-
-  const bool syncSgdTraining =
-      !testing_ && config_.algorithm() == TrainAlgorithm::SGD;
-  if (syncSgdTraining) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-// Launches controller() on a new thread owned by controllerThread_.
-void SparseRemoteParameterUpdater::startController() {
-  controllerThread_.reset(
-      new std::thread(&SparseRemoteParameterUpdater::controller, this));
-}
-
-// Body of the controller thread. Using its own ParameterClient2 it repeats
-// one cycle per pass: wait for the pass to start and issue
-// PSERVER_OP_START_PASS, issue PSERVER_OP_SGD until the client reports that
-// the pass is finished, then issue PSERVER_OP_FINISH_PASS. The loop ends once
-// passCount_ equals expectedPassCount_.
-// NOTE(review): the exit test is an equality, so a non-positive
-// expectedPassCount_ would never terminate -- confirm callers pass >= 1.
-void SparseRemoteParameterUpdater::controller() {
-  ParameterClient2 client(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse);
-  client.init(parameters_);
-
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackParameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    // One PSERVER_OP_SGD round per iteration until the pass is finished.
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackParameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackParameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-// Records the batch size (later sent by finishBatch()) and always reports
-// PASS_TRAIN.
-PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) {
-  batchSize_ = batchSize;
-  return PASS_TRAIN;
-}
-
-// Sends PARAMETER_GRADIENT through the parameter client at the end of a
-// batch. The update mode follows config_.algorithm(): async SGD maps to
-// PSERVER_UPDATE_MODE_ASYNC_SGD, SGD to PSERVER_UPDATE_MODE_ADD_GRADIENT, and
-// anything else is fatal. The `cost` argument is not used; 0 is sent instead.
-void SparseRemoteParameterUpdater::finishBatch(real cost) {
-  const std::string& algorithm = config_.algorithm();
-  const bool isAsyncSgd = (algorithm == TrainAlgorithm::AsyncSGD);
-  if (!isAsyncSgd && !(algorithm == TrainAlgorithm::SGD)) {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-  const ParameterUpdateMode mode = isAsyncSgd
-                                       ? PSERVER_UPDATE_MODE_ASYNC_SGD
-                                       : PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  const ParameterType sendType = PARAMETER_GRADIENT;
-
-  REGISTER_TIMER("sendSparseParam");
-  parameterClient_->sendAndReceiveParameter(mode,
-                                            sendType,
-                                            batchSize_,
-                                            0,       // cost = 0
-                                            false);  // sendBackParameter
-
-  // grad zero move to sgd grad machine, before merge grad sparse remote
-}
-
-// Begins a pass. For SGD the trainer only waits for the pass start; for any
-// other algorithm trainer 0 first issues PSERVER_OP_START_PASS and then every
-// trainer calls asyncStartPass().
-void SparseRemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-    return;
-  }
-
-  const bool isFirstTrainer = (FLAGS_trainer_id == 0);
-  if (isFirstTrainer) {
-    PreparedOperations startOps;
-    startOps.addOperation(PSERVER_OP_START_PASS);
-    parameterClient_->doOperation(startOps,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackParameter= */ false);
-  }
-  parameterClient_->asyncStartPass();
-}
-
-// Ends a pass and always returns true. For SGD the trainer only waits for
-// the pass to finish; for any other algorithm trainer 0 first issues
-// PSERVER_OP_FINISH_PASS and then every trainer calls asyncFinishPass().
-bool SparseRemoteParameterUpdater::finishPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-    return true;
-  }
-
-  const bool isFirstTrainer = (FLAGS_trainer_id == 0);
-  if (isFirstTrainer) {
-    PreparedOperations finishOps;
-    finishOps.addOperation(PSERVER_OP_FINISH_PASS);
-    parameterClient_->doOperation(finishOps,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackParameter= */ false);
-  }
-  parameterClient_->asyncFinishPass();
-  return true;
-}
-
-// The trainer calls getParametersRemote() at batch start or before saving,
-// so no values are fetched in apply() and restore().
-// When useApplyInPserver_ is set, apply() only asks the pservers to run
-// PSERVER_OP_APPLY; otherwise it does nothing.
-void SparseRemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackParameter= */ false);
-  }
-}
-
-// Intentionally empty: this updater performs no work on restore().
-void SparseRemoteParameterUpdater::restore() {}
-
-// Fetches PARAMETER_VALUE through the parameter client, either in full
-// (`fullSize`) via getParameter() or sparsely via getParameterSparse(). The
-// send-back type is PARAMETER_APPLY when both useApplyInPserver_ and `apply`
-// are true, and PARAMETER_VALUE otherwise. Afterwards, if
-// config_.shrink_parameter_value() is positive, applyL1 with that rate is run
-// on every parameter whose decay_rate_l1 is positive.
-void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize,
-                                                       bool apply) {
-  const ParameterType sendBackParameterType =
-      (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE;
-
-  REGISTER_TIMER("getParamDenseAndSparse");
-  if (fullSize) {
-    parameterClient_->getParameter(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-  } else {
-    parameterClient_->getParameterSparse(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-  }
-
-  if (!(config_.shrink_parameter_value() > 0)) {
-    return;
-  }
-  for (auto& para : parameters_) {
-    if (!(para->getConfig().decay_rate_l1() > 0)) {
-      continue;
-    }
-    const real decayRate = config_.shrink_parameter_value();
-    // Full-size parameters shrink their buffer, sparse ones their matrix.
-    if (fullSize) {
-      para->getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    } else {
-      para->getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    }
-  }
-}
-
-// Issues PSERVER_OP_RANDOMIZE to the pservers. May only be called on
-// trainer 0 (enforced by the CHECK_EQ).
-void SparseRemoteParameterUpdater::randParametersRemote() {
-  CHECK_EQ(FLAGS_trainer_id, 0);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RANDOMIZE);
-  parameterClient_->doOperation(ops,
-                                /* waitForGradient= */ false,
-                                /* sendBackParameter= */ false);
-}
-
-// Trainer 0 asks the parameter client to load the value vector from
-// `dirName`. In testing mode trainer 0 then sets
-// PSERVER_STATUS_PARAMETER_READY while every other trainer waits for it.
-void SparseRemoteParameterUpdater::loadParametersRemote(
-    const std::string& dirName) {
-  const bool isFirstTrainer = (FLAGS_trainer_id == 0);
-  if (isFirstTrainer) {
-    parameterClient_->loadValueVector(dirName);
-  }
-
-  if (!testing_) {
-    return;
-  }
-
-  // we do not use synchronize() here,
-  // because test mode may run only one tester
-  if (isFirstTrainer) {
-    parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-  } else {
-    parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-  }
-}
-
-// Trainer 0 asks the parameter client to save the value vector to `dirName`;
-// all other trainers do nothing.
-void SparseRemoteParameterUpdater::saveParametersRemote(
-    const std::string& dirName) {
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->saveValueVector(dirName);
-  }
-}
-
-// Splits `parameters` into the sparse-remote group and the normal group,
-// requires both groups to be non-empty, and initializes each sub-updater with
-// its own group through the sync thread pool. parameterTypes_ is taken from
-// the normal updater.
-void SparseRemoteParameterUpdaterComposite::init(
-    const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-
-  std::vector<ParameterPtr> parametersArray[NUMBER_UPDATERS];
-  for (auto& para : parameters_) {
-    const int slot = para->isSparseRemoteUpdate() ? UPDATER_SPARSE_REMOTE
-                                                  : UPDATER_NORMAL;
-    parametersArray[slot].push_back(para);
-  }
-  CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty());
-  CHECK(!parametersArray[UPDATER_NORMAL].empty());
-
-  // The thread id doubles as the index of the updater to initialize.
-  syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-    updaters_[tid]->init(parametersArray[tid]);
-  });
-
-  parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes();
-}
-
-// Out-of-class definition of the static list of registered updater creators.
-std::vector<std::function<ParameterUpdater*(
-    const std::string&, const OptimizationConfig&, bool, size_t)>>
-    ParameterUpdaterCreators::constructors_;
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.h b/paddle/legacy/trainer/RemoteParameterUpdater.h
deleted file mode 100644
index 684685329..000000000
--- a/paddle/legacy/trainer/RemoteParameterUpdater.h
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "ParameterUpdater.h"
-#include "paddle/legacy/pserver/ParameterClient2.h"
-#include "paddle/legacy/utils/Queue.h"
-#include "paddle/legacy/utils/Util.h"
-
-namespace paddle {
-
-// TODO(yanfei):
-// I think that the biggest feature of RDMA is its lossless packet control
-// rather than high bandwidth, zero copy and GPU-direct RDMA in
-// theory.
-// But the zero-copy and GPU-direct RDMA features can help to reduce latency
-// caused by the operating system.
-// So, for some specific clusters, such as high-density GPU clusters,
-// GPU-direct and zero copy could help to improve cluster communication
-// performance.
-//
-
-/**
- * Normal remote parameter updater for dense parameters.
- *
- * It first packs all parameters for all pservers using the ParameterClient
- * module, then waits for the merged parameter data from all pservers.
- * The synchronization pattern specified by sync-sgd or async-sgd is
- * achieved by all pservers with the help of the controller within this
- * remote parameter updater.
- * This module bridges the gradient machines and the parameter servers.
- * It helps to transfer the parameters from the acceleration device to the
- * cpu end for the network. It contains additional parameter copy buffers for
- * acceleration devices at the cpu end, such as gpu; otherwise it will
- * directly use the original parameter data to update pservers.
- *
- * This remote parameter updater does not use a pipeline mechanism to hide
- * the copy latency from gpu to cpu buffer. In addition, overlapping
- * backward computation with communication is not supported.
- */
-class RemoteParameterUpdater : public ParameterUpdater {
- public:
-  RemoteParameterUpdater(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
-  /// Joins the controller thread if one was started.
-  ~RemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * Forwards the call to the local updater (if any), records the batch size
-   * and sets the batch status to BATCH_START. Always returns PASS_TRAIN.
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    if (localUpdater_) {
-      localUpdater_->startBatch(batchSize);
-    }
-    batchSize_ = batchSize;
-    batchStatus_ = BATCH_START;
-    return PASS_TRAIN;
-  }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary. it will implicitly
-   * cooperate with controller thread for sync-sgd.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-#ifndef PADDLE_DISABLE_TIMER
-  /// Forwards the forward/backward time to the parameter client.
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
-  virtual void apply();
-  virtual void restore();
-
- protected:
-  /**
-   * control all pservers with all trainers for sync-sgd
-   */
-  virtual void controller();
-
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
-  void startController();
-
-  /**
-   * @brief copy parameters from cpu host to device, such as gpu.
-   *
-   * @note  return if all data are transferred.
-   */
-  void copyParametersToDevice(ParameterType parameterType);
-
-  /**
-   * @brief copy parameters from device to cpu host
-   *
-   * @note  return if all data are transferred
-   */
-  void copyParametersFromDevice(ParameterType parameterType);
-
- protected:
-  /// Optimization config used to guide initialization and finishBatch
-  OptimizationConfig config_;
-  /// internal parameter client object for exchanging data with pserver
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  /// internal shadow buffer at cpu host end, use original parameters_
-  /// if no acceleration devices are used.
-  std::vector<ParameterPtr> cpuParameters_;
-  /// local updater for aggregating multi-batches local delta
-  std::unique_ptr<ParameterUpdater> localUpdater_;
-  /// the size of mini-batch
-  int64_t batchSize_;
-  /// batches passed
-  int64_t numBatches_;
-  /// for stateful control
-  BatchStatus batchStatus_;
-  /// controller thread for sync-sgd
-  std::unique_ptr<std::thread> controllerThread_;
-  /// passes already finished
-  int64_t passCount_;
-  /// expected number of passes to finish
-  int64_t expectedPassCount_;
-  /// use normal synchronization communication if True
-  bool separateSendAndRecv_;
-  /// true if it's first pass
-  bool isFirstPass_;
-  /// NOTE(review): presumably selects running "apply" on the pservers rather
-  /// than locally -- confirm against the .cpp implementation.
-  bool useApplyInPserver_;
-
-  // NOTE(review): the values and uses of these constants are defined in the
-  // .cpp file; they look like averaging-mode names -- confirm there.
-  static const std::string kAverage;
-  static const std::string kElasticAverage;
-};
-
-// TODO(yanfei):
-// Do parameter-level synchronization optimization at the pserver end with
-// ConcurrentRemoteParameterUpdater to get more parallelism, and finally
-// to really hide pserver latency behind backward computation.
-//
-/**
- * This updater adds an additional optimization for overlapping
- * synchronization from pservers with backward computation.
- *
- * A parameter can be sent to pservers when its backward stage is finished.
- * This concurrent updater copies data from the acceleration device to host
- * memory asynchronously. In addition, the internal parameter client reads
- * data in host memory and sends them to all pservers in the next stage. So
- * this class helps to pipeline device-to-host copy and host-to-network to
- * hide network latency in the backward stage.
- * It contains separate send and recv threads for pipeline usage.
- */
-class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
- public:
-  ConcurrentRemoteParameterUpdater(
-      OptimizationConfig config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater);
-  ~ConcurrentRemoteParameterUpdater();
-
-  /**
-   * @brief send parameters to all pservers
-   *
-   * @note  it just signals the end to the internal parameter client
-   *        to finish the asynchronous send action. In addition it also
-   *        does synchronization for all asynchronous host-to-device copies.
-   */
-  virtual void finishBatch(real cost);
-
- protected:
-  /// Runs the optional local update and queues the parameter for sending.
-  virtual void updateImpl(Parameter* para);
-  /// internal function called in send thread
-  void send(Parameter* para);  // para == NULL indicate end of a minibatch
-  /// internal function called in recv thread
-  void recv(Parameter* para);
-  /**
-   * @brief send thread for relaying data from gradient to parameter client
-   *
-   * @note  just pipe data to internal parameter client for pipeline
-   */
-  void send();
-  /**
-   * @brief recv thread for relaying data from internal parameter client to
-   *        host memory
-   *
-   * @note  it contains the asynchronous data copy from host to device
-   */
-  void recv();
-  /// copy specified parameter from host to device
-  void copySingleParaToDevice(Parameter* para, ParameterType parameterType);
-  /// copy specified parameter from device to host
-  void copySingleParaFromDevice(Parameter* para, ParameterType parameterType);
-  /// True when (numBatches_ + 1) is a multiple of
-  /// config_.num_batches_per_send_parameter().
-  bool needToUpdateRemotely() {
-    return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
-  }
-
- private:
-  /// send thread used for overlapping
-  std::unique_ptr<std::thread> sendThread_;
-  /// recv thread used for overlapping
-  std::unique_ptr<std::thread> recvThread_;
-  /// buffer queue for overlapping
-  Queue<int> sendQueue_;
-  /// buffer queue for overlapping
-  Queue<int> recvQueue_;
-  /// flag indicating that the threads should stop
-  bool stopping_;
-  /// condition variable for thread synchronization between the
-  /// thread calling finishBatch and the internal recv thread
-  LockedCondition finishBatchCond_;
-  /// NOTE(review): presumably the predicate guarded by finishBatchCond_ --
-  /// confirm against the .cpp implementation.
-  bool oneBatchFinished_;
-};
-
-// TODO(yanfei):
-// Merge the sparse updater with the dense updater, which could help to reduce
-// the synchronization between the sparse and dense updaters. It could also
-// reduce the number of threads needed to manage all connections.
-/**
- * This class is specialized for updating sparse parameters.
- *
- * It allows only part of a parameter to be exchanged with all pservers.
- * If sparse input is assigned, part of the gradients of the first hidden
- * layer can remain zero and need not be exchanged with all pservers.
- * This is the key optimization point of this updater.
- *
- * For updating sparse parameters, all latest parameters are stored
- * in pservers instead of keeping a full copy at the trainer end, so the
- * parameter values that can change in the next batch must be prefetched
- * before doing the next forwardbackward. Also, because the
- * parameters can be stored in pservers instead of the trainer, we can
- * fetch only the specified parameters when necessary, and can support huge
- * parameters which are larger than the RAM size of a single
- * node.
- *
- * Internally, this updater will direct the internal parameter client
- * to encapsulate sparse-specific messages for all pservers.
- */
-class SparseRemoteParameterUpdater : public ParameterUpdater {
- public:
-  SparseRemoteParameterUpdater(const OptimizationConfig& config,
-                               int expectedPassCount,
-                               bool testing);
-  /// Joins the controller thread if one was started.
-  ~SparseRemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /// initialization
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  /// stateful batch control
-  virtual PassType startBatch(int64_t batchSize);
-  /// send all sparse related parameters to all pservers
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-  virtual void apply();
-  virtual void restore();
-
-  /// load parameters from pservers
-  virtual void loadParametersRemote(const std::string& dirName);
-  /// save parameters to pservers
-  virtual void saveParametersRemote(const std::string& dirName);
-  /**
-   * @brief get latest sparse parameters value from all pservers
-   *
-   * @note  call it before next mini-batch
-   */
-  virtual void getParametersRemote(bool fullSize, bool apply);
-  virtual void randParametersRemote();
-#ifndef PADDLE_DISABLE_TIMER
-  /// Forwards the forward/backward time to the parameter client.
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
- protected:
-  /// update implementation; intentionally a no-op for this updater
-  virtual void updateImpl(Parameter* para) {}
-
-  /// internal controller routine for controller thread
-  virtual void controller();
-
-  /// start controller thread
-  void startController();
-
- protected:
-  /// optimization config
-  OptimizationConfig config_;
-  /// internal parameter client
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  /// size of the current mini-batch, set by startBatch()
-  int64_t batchSize_;
-  /// controller thread; joined in the destructor when present
-  std::unique_ptr<std::thread> controllerThread_;
-  /// number of passes finished so far
-  int64_t passCount_;
-  /// number of passes expected in total
-  int64_t expectedPassCount_;
-  /// testing flag passed to the constructor
-  bool testing_;
-  /// NOTE(review): presumably selects running "apply" on the pservers --
-  /// confirm against the .cpp implementation.
-  bool useApplyInPserver_;
-};
-
-/**
- * Class for supporting both the normal updater and the sparse updater.
- *
- * Not all parts of one model are sparse, so there is a dense updater
- * for normal layers while the sparse updater is for sparse layers.
- *
- * It directly calls the internal dense and sparse updaters individually.
- */
-class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
- public:
-  /// Indices into updaters_; NUMBER_UPDATERS is the number of sub-updaters.
-  enum {
-    UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
-    UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
-    NUMBER_UPDATERS = 2,
-  };
-  /**
-   * @brief create one dense updater and one sparse updater
-   *
-   * The sparse updater is constructed here from config, expectedPassCount
-   * and testing; the dense updater is taken over from normalUpdater. The
-   * sync thread pool is created with NUMBER_UPDATERS - 1 threads.
-   *
-   * @note  use syncThreadPool to synchronize these two updaters
-   */
-  SparseRemoteParameterUpdaterComposite(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      bool testing,
-      std::unique_ptr<ParameterUpdater>&& normalUpdater) {
-    updaters_.resize(NUMBER_UPDATERS);
-    updaters_[UPDATER_SPARSE_REMOTE].reset(
-        new SparseRemoteParameterUpdater(config, expectedPassCount, testing));
-    updaters_[UPDATER_NORMAL] = std::move(normalUpdater);
-
-    syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1));
-  }
-
-  /// initialization of dense and sparse updaters
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-};
-
-/// Static registry of factory functions for custom ParameterUpdaters.
-class ParameterUpdaterCreators {
- public:
-  /**
-   * @brief Register a creator that can build a custom ParameterUpdater for
-   *        training. A creator is a function of type (algorithm, optConfig,
-   *        isLocal, numPasses) -> ParameterUpdater*. The trainer uses the
-   *        returned updater when it is not nullptr; a creator that returns
-   *        nullptr leaves the choice to the trainer's default updaters.
-   *
-   * @param creator function which can create a ParameterUpdater.
-   */
-  static void addCreator(
-      const std::function<ParameterUpdater*(
-          const std::string&,         // algo
-          const OptimizationConfig&,  // optConfig
-          bool,                       // isLocal
-          size_t                      // numPasses
-          )>& creator) {  // NOLINT  explicit move closing ) in this line
-                          // for readability
-    constructors_.push_back(creator);
-  }
-
-  /**
-   * @brief Ask each registered creator, in registration order, to build an
-   *        updater and return the first non-null result.
-   * @param algo algorithm string.
-   * @param optConfig optimization config.
-   * @param isLocal is in local mode or not.
-   * @param numPasses total passes that trainer will train.
-   * @return nullptr if no creator produced an updater, otherwise the updater.
-   */
-  static ParameterUpdater* tryCreateUpdater(const std::string& algo,
-                                            const OptimizationConfig& optConfig,
-                                            bool isLocal,
-                                            size_t numPasses) {
-    for (const auto& create : constructors_) {
-      ParameterUpdater* candidate =
-          create(algo, optConfig, isLocal, numPasses);
-      if (candidate != nullptr) {
-        return candidate;
-      }
-    }
-    return nullptr;
-  }
-
- private:
-  /// Registered creators, in registration order.
-  static std::vector<std::function<ParameterUpdater*(
-      const std::string&, const OptimizationConfig&, bool, size_t)>>
-      constructors_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.cpp b/paddle/legacy/trainer/Tester.cpp
deleted file mode 100644
index d977ca965..000000000
--- a/paddle/legacy/trainer/Tester.cpp
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Tester.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-// Builds a tester around an existing gradient machine and parameter updater.
-// Fails fatally when the sparse remote updater is enabled, creates the test
-// evaluator, creates and initializes a parameter client when distributed
-// testing is requested, and finally sets up the ParameterUtil helper.
-Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-               std::unique_ptr<TesterConfig>&& intconfig,
-               const GradientMachinePtr& gradientMachine,
-               const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-               std::shared_ptr<DataProvider> testDataProvider)
-    : config_(config),
-      intconfig_(std::move(intconfig)),
-      gradientMachine_(gradientMachine),
-      parameterUpdater_(parameterUpdater),
-      testDataProvider_(testDataProvider) {
-  const bool sparseRemote =
-      config_->getOptConfig().use_sparse_remote_updater();
-  if (sparseRemote) {
-    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
-               << "when doing train and test jobs in the same "
-               << "process. You could run paddle --job=test in "
-               << "a separate process.";
-  }
-  testEvaluator_.reset(gradientMachine_->makeEvaluator());
-
-  if (intconfig_->distributeTest) {
-    testParameterClient_.reset(new ParameterClient2(true));
-  }
-  if (testParameterClient_ != nullptr) {
-    testParameterClient_->init(gradientMachine_->getParameters());
-  }
-
-  // The util config is handed over to ParameterUtil as a temporary.
-  paramUtil_.reset(new ParameterUtil(
-      config_,
-      std::unique_ptr<ParameterUtilConfig>(
-          new ParameterUtilConfig(intconfig_->saveOnlyOne,
-                                  intconfig_->savingPeriod,
-                                  intconfig_->loadsaveParametersInPserver,
-                                  intconfig_->config)),
-      gradientMachine_,
-      parameterUpdater_));
-}
-
-// Prepares a test period: resets the test data provider (if any), starts the
-// evaluator, zeroes the accumulated cost and sample count, and calls apply()
-// on the parameter updater. When prevBatchState is set, the current machine
-// state is saved into trainState and the machine is switched to testState.
-void Tester::startTestPeriod() {
-  if (testDataProvider_) {
-    testDataProvider_->reset();
-  }
-  testEvaluator_->start();
-  testContext_.cost = 0;
-  testContext_.numSamples = 0;
-
-  parameterUpdater_->apply();
-  if (intconfig_->prevBatchState) {
-    // Save the training state first, then load the testing state.
-    gradientMachine_->getState(*intconfig_->trainState);
-    gradientMachine_->setState(*intconfig_->testState);
-  }
-}
-
-// Forwards one batch through the test evaluator and adds its cost and its
-// size to the running totals in testContext_.
-void Tester::testOneDataBatch(const DataBatch& dataBatch,
-                              std::vector<Argument>* outArgs) {
-  const real batchCost =
-      forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs);
-  testContext_.cost += batchCost;
-  testContext_.numSamples += dataBatch.getSize();
-}
-
-// Runs one complete test period over the test data provider:
-// startTestPeriod(), one testOneDataBatch() per batch until the provider is
-// exhausted, then finishTestPeriod().
-void Tester::testOnePeriod() {
-  // startTestPeriod() tolerates a missing provider, but the loop below
-  // dereferences it, so fail with a clear message instead of crashing.
-  CHECK(testDataProvider_) << "TestData is not specified";
-  DataBatch dataBatch;
-  int64_t batchSize = config_->getOptConfig().batch_size();
-  std::vector<Argument> outArgs;
-  startTestPeriod();
-  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
-    testOneDataBatch(dataBatch, &outArgs);
-  }
-  finishTestPeriod();
-}
-
-// Closes a test period: finishes the evaluator, requires that at least one
-// sample was seen, logs the sample count, mean cost and evaluator result, and
-// calls restore() on the parameter updater. When prevBatchState is set, the
-// machine state is reset first, and at the end the testing state is saved
-// into testState and the machine is switched back to trainState.
-void Tester::finishTestPeriod() {
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-  testEvaluator_->finish();
-  // The CHECK also guards the division in the log statement below.
-  CHECK_GT(testContext_.numSamples, 0)
-      << "There is no samples in your test batch. Possibly "
-         "wrong implementation of DataProvidor.reset()";
-  LOG(INFO) << " Test samples=" << testContext_.numSamples
-            << " cost=" << testContext_.cost / testContext_.numSamples
-            << " Eval: " << *testEvaluator_;
-  parameterUpdater_->restore();
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->getState(*intconfig_->testState);
-    gradientMachine_->setState(*intconfig_->trainState);
-  }
-}
-
-// Pulls the next batch from the test data provider, forwards it and adds its
-// size and cost to stats_. Logs the running statistics every logPeriod
-// batches.
-//
-// @param batchId zero-based index of the batch, used only for logging.
-// @return number of samples in the batch; 0 when no data was returned.
-int64_t Tester::testOneBatchById(int64_t batchId) {
-  DataBatch dataBatch;
-  const int32_t batchSize = config_->getOptConfig().batch_size();
-  testDataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  const int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize != 0) {
-    std::vector<Argument> outArgs;
-    const real batchCost =
-        forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs);
-    stats_ += std::pair<int64_t, real>{actualBatchSize, batchCost};
-
-    const int64_t batchesDone = batchId + 1;
-    if ((batchesDone % intconfig_->logPeriod) == 0) {
-      LOG(INFO) << " Batch=" << batchesDone << " " << stats_.getStats(false);
-    }
-  }
-  return actualBatchSize;
-}
-
-// Runs one forward pass in PASS_TEST mode over `dataBatch`.
-//
-// When loadsaveParametersInPserver is set, the parameters needed by the batch
-// are prefetched and pulled from the pservers first. When featFile is set and
-// there are outputs, the output values are appended to that file as raw
-// `real` data, sample by sample. When predictOutputDir is set and there are
-// outputs, they are printed to a per-trainer "rank-NNNNN" file instead of
-// being summed.
-//
-// @param dataBatch  input batch; its streams are fed to the gradient machine.
-// @param evaluator  if non-null, gradientMachine_->eval() is run with it.
-// @param pOutArgs   receives the forward outputs; must not be null.
-// @return Argument::sum() over the outputs, or 0.0 when the outputs were
-//         written to predictOutputDir.
-real Tester::forwardOneBatch(const DataBatch& dataBatch,
-                             Evaluator* evaluator,
-                             std::vector<Argument>* pOutArgs) {
-  auto& outArgs = *pOutArgs;
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-  if (intconfig_->loadsaveParametersInPserver) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote(false /*full parameter*/,
-                                           true /*after apply*/);
-  }
-
-  gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
-
-  // write features if set this flag and outArgs is not empty
-  // (the check used to be inverted, which indexed featMatrices[0] on an
-  // empty vector and never wrote anything when outputs were present)
-  std::string featFile = intconfig_->featFile;
-  if (!featFile.empty() && !outArgs.empty()) {
-    size_t numOutputs = outArgs.size();
-    std::vector<MatrixPtr> featMatrices;
-    featMatrices.resize(numOutputs);
-    for (size_t i = 0; i < numOutputs; ++i) {
-      featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(),
-                                       outArgs[i].value->getWidth(),
-                                       false,
-                                       false);  // CPU data buffer
-      featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    FILE* fp = fopen(featFile.c_str(), "ab+");
-    // fopen() reports failure with a null pointer; ferror() must not be
-    // called on it.
-    CHECK(fp != nullptr) << "Fail to open " << featFile;
-
-    // Samples are written row by row, all outputs of a sample together.
-    size_t sampleNum = featMatrices[0]->getHeight();
-    for (size_t i = 0; i < sampleNum; ++i) {
-      for (size_t j = 0; j < numOutputs; ++j) {
-        size_t dim = featMatrices[j]->getWidth();
-        fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp);
-      }
-    }
-    fclose(fp);
-  }
-  if (evaluator) {
-    gradientMachine_->eval(evaluator);
-  }
-
-  // Save the output layers if predict_output_dir is not empty
-  std::string predictOutputDir = intconfig_->predictOutputDir;
-  if (!predictOutputDir.empty() && !outArgs.empty()) {
-    CHECK(intconfig_->testing) << "Only valid in test mode";
-    if (!os_.is_open()) {
-      // TODO(yuyang18): Refactor these lines.
-      constexpr int kBufLen = 100;
-      char buf[kBufLen];
-      snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId);
-      mkDir(predictOutputDir.c_str());
-      std::string filename = path::join(predictOutputDir, buf);
-      os_.open(filename, std::ofstream::trunc);
-      CHECK(os_.is_open()) << "Failed to open file " << filename;
-    }
-    printOutput(outArgs, os_);
-    return 0.0;  // In this case, there is no meaning to calculate cost
-  }
-
-  return Argument::sum(outArgs);
-}
-
-// Runs a single forward pass with empty inputs in PASS_TEST mode, records the
-// cost and sample count reported by the gradient machine in stats_, signals
-// the end of the pass and logs the result.
-void Tester::testOnePassBatch(int passId) {
-  stats_.reset();
-
-  const std::vector<Argument> noInputs;
-  gradientMachine_->forward(noInputs, nullptr, PASS_TEST);
-
-  // Both values are filled in by getStats().
-  int64_t sampleCount;
-  real totalCost;
-  gradientMachine_->getStats(totalCost, sampleCount);
-  stats_ += std::pair<int64_t, real>{sampleCount, totalCost};
-
-  gradientMachine_->onPassEnd();
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false);
-}
-
-// Tests one full pass: batches are pulled with testOneBatchById() until one
-// comes back empty, then the pass is closed and the statistics and evaluator
-// result are logged. With distributeTest set, the evaluation is additionally
-// merged through the test parameter client and logged by trainer 0.
-void Tester::testOnePass(int passId) {
-  stats_.reset();
-  int64_t batchId = 0;
-  // Same type as testOneBatchById()'s return value, so nothing is narrowed.
-  int64_t num = 0;
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-
-  testEvaluator_->start();
-
-  do {
-    num = testOneBatchById(batchId);
-    ++batchId;
-  } while (num > 0);
-
-  gradientMachine_->onPassEnd();
-  testEvaluator_->finish();
-
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false)
-            << " Eval: " << *testEvaluator_;
-
-  if (intconfig_->distributeTest) {
-    testEvaluator_->distributeEval(testParameterClient_.get());
-    if (0 == intconfig_->trainerId) {
-      LOG(INFO) << "distribute eval: " << *testEvaluator_;
-    }
-  }
-}
-
-void Tester::test() {
-  CHECK(testDataProvider_) << "TestData is not specified";
-  testDataProvider_->setSkipShuffle();
-  testDataProvider_->reset();
-  gradientMachine_->start();
-
-  // For evaluation
-  std::vector<std::string> modelList;
-  std::string modelListFromConfig = intconfig_->modelList;
-  std::string initModelPath = intconfig_->initModelPath;
-  if (!modelListFromConfig.empty()) {
-    loadFileList(modelListFromConfig, modelList);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = modelList.size();
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  } else if (!initModelPath.empty()) {
-    modelList.push_back(initModelPath);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = 1;
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  }
-
-  for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) {
-    int passId = i;
-    if (passId % intconfig_->savingPeriod == 0) {
-      if (intconfig_->testWait) {
-        while (paramUtil_->loadParameters(
-                   passId, true /*local*/, true /*remote*/) == false) {
-          LOG(INFO) << "Waiting for parameters of pass " << passId;
-          sleep(60);  // sleep 60s
-        }
-      } else {
-        if (modelList.size() == 0) {
-          CHECK_EQ(paramUtil_->loadParameters(
-                       passId, true /*local*/, true /*remote*/),
-                   true);
-        } else {
-          paramUtil_->loadParametersWithPath(
-              modelList[i], true /*local*/, true /*remote*/);
-        }
-      }
-      if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) {
-        testOnePassBatch(passId);
-      } else {
-        testOnePass(passId);
-      }
-      if (passId + intconfig_->savingPeriod < intconfig_->numPasses) {
-        // if there is at least 1 more pass to test, then call reset,
-        // otherwise not.
-        testDataProvider_->reset();
-      }
-    }
-  }
-
-  gradientMachine_->finish();
-}
-
-void Tester::printOutput(const std::vector<Argument>& outArgs,
-                         std::ostream& os) {
-  size_t numOutputs = outArgs.size();
-  size_t numIns = outArgs[0].getBatchSize();
-  if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) {
-    cpuMat_.resize(numOutputs, nullptr);
-    cpuVec_.resize(numOutputs, nullptr);
-  }
-
-  for (size_t i = 0; i < numOutputs; ++i) {
-    if (outArgs[i].value != nullptr) {
-      if (outArgs[i].value->useGpu()) {
-        if (dynamic_cast<GpuMatrix*>(outArgs[i].value.get())) {
-          size_t dim = outArgs[i].value->getWidth();
-          Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false);
-          cpuMat_[i]->copyFrom(*outArgs[i].value);
-        } else if (dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get())) {
-          auto sparseMat =
-              dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get());
-          cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(),
-                                                  sparseMat->getWidth(),
-                                                  sparseMat->getElementCnt(),
-                                                  sparseMat->getValueType(),
-                                                  sparseMat->format_,
-                                                  false,  /* trans */
-                                                  false); /* useGpu */
-          hl_stream_t stream = HPPL_STREAM_DEFAULT;
-          cpuMat_[i]->copyFrom(*sparseMat, stream);
-        } else {
-          LOG(WARNING) << "Not supported gpu matrix type";
-        }
-      }
-    } else if (outArgs[i].ids != nullptr) {
-      if (outArgs[i].ids->useGpu()) {
-        IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false);
-        cpuVec_[i]->copyFrom(*outArgs[i].ids);
-      }
-    } else if (outArgs[i].strs != nullptr) {
-      continue;
-    } else {
-      LOG(WARNING) << "outArgs[" << i << "] has no data to print";
-    }
-  }
-
-  for (size_t i = 0; i < numIns; ++i) {
-    for (size_t j = 0; j < numOutputs; ++j) {
-      if (outArgs[j].value != nullptr) {
-        if (outArgs[j].value->useGpu()) {
-          cpuMat_[j]->printOneRow(os, i);
-        } else {
-          outArgs[j].value->printOneRow(os, i);
-        }
-      } else if (outArgs[j].ids != nullptr) {
-        if (outArgs[j].ids->useGpu()) {
-          cpuVec_[j]->printOneElement(os, i);
-        } else {
-          outArgs[j].ids->printOneElement(os, i);
-        }
-      } else if (outArgs[j].strs != nullptr) {
-        os << (*outArgs[j].strs)[i] << ";";
-      }
-    }
-    os << std::endl;
-  }
-}
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.h b/paddle/legacy/trainer/Tester.h
deleted file mode 100644
index a298602d1..000000000
--- a/paddle/legacy/trainer/Tester.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "TesterConfig.h"
-#include "TrainerInternalConfig.h"
-
-namespace paddle {
-
-/**
- * Neural Network test logics code.
- * It is a private class for Trainer.
- */
-class Tester {
- public:
-  /**
-   * Ctor
-   * @param config Trainer Config.
-   * @param intconfig Tester Config.
-   * @param gradientMachine Gradient machine(neuralnetwork) that will be tested.
-   * @param parameterUpdater Parameter Updater. Not for updating parameter, just
-   *                         for getting parameter from parameter-server.
-   * @param testDataProvider Test data provider.
-   */
-  Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-         std::unique_ptr<TesterConfig>&& intconfig,
-         const GradientMachinePtr& gradientMachine,
-         const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-         std::shared_ptr<DataProvider> testDataProvider);
-
-  /**
-   * test one period.
-   *
-   * One period means 2 things.
-   *   if test_period !=0 and not test_all_data_in_one_period, then
-   *      will test test_period * batch_size data.
-   *   else
-   *      will test whole test data.
-   *
-   * It is convenience to test small set of data when test data set is large and
-   * is training at same time.
-   */
-  void testOnePeriod();
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const DataBatch& dataBatch,
-                        std::vector<Argument>* outArgs);
-
-  /**
-   * Test for given data batch.
-   * @param dataBatch Data batch.
-   * @param evaluator Evaluator
-   * @return cost
-   */
-  real forwardOneBatch(const DataBatch& dataBatch,
-                       Evaluator* evaluator,
-                       std::vector<Argument>* outArgs);
-
-  /**
-   * performance the full pass of test given test data provider
-   */
-  void test();
-
- protected:
-  std::shared_ptr<ParameterClient2> testParameterClient_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TesterConfig> intconfig_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  std::unique_ptr<Evaluator> testEvaluator_;
-  std::unique_ptr<ParameterUtil> paramUtil_;
-  DataProviderPtr testDataProvider_;
-  TrainerStats stats_;
-
-  // Used for saving the values of output layers
-  std::ofstream os_;
-  std::vector<MatrixPtr> cpuMat_;
-  std::vector<IVectorPtr> cpuVec_;
-  struct {
-    int64_t numSamples;
-    real cost;
-  } testContext_;
-
- private:
-  /**
-   * Test one batch by batchId. It is only used for testOnePass.
-   *
-   * Durning testOnePass, each log_period will print cost statistics.
-   *
-   * @param batchId current batch id (from 0)
-   * @return num of tested samples. Zero if end of pass.
-   */
-  int64_t testOneBatchById(int64_t batchId);
-
-  /**
-   * Test whole pass in one batch.
-   *
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePassBatch(int passId);
-
-  /**
-   * test for one pass in several mini-batches.
-   *
-   * Used for sgd method.
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePass(int passId);
-
-  /**
-   * print the outArgs to a stream
-   *
-   * used for save feature file
-   *
-   * @param [in] outArgs output arguments for network.
-   * @param [in,out] os output stream.
-   */
-  void printOutput(const std::vector<Argument>& outArgs, std::ostream& os);
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TesterConfig.h b/paddle/legacy/trainer/TesterConfig.h
deleted file mode 100644
index 6c78f7cda..000000000
--- a/paddle/legacy/trainer/TesterConfig.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-
-/**
- * TesterConfig
- * general configs for training
- */
-struct TesterConfig {
-  /**
-   * indicate test period
-   */
-  int testPeriod;
-
-  /**
-   * indicate whether to save previous batch state
-   */
-  bool prevBatchState;
-
-  /**
-   * log period
-   */
-  int logPeriod;
-
-  /**
-   * loadsave parameters in pserver
-   */
-  bool loadsaveParametersInPserver;
-
-  /**
-   * feat file
-   */
-  std::string featFile;
-
-  /**
-   * predict output dir
-   */
-  std::string predictOutputDir;
-
-  /**
-   * trianer id
-   */
-  int trainerId;
-
-  /**
-   * distribute test
-   */
-  bool distributeTest;
-
-  /**
-   * training state
-   */
-  MachineState* trainState;
-
-  /**
-   * test state
-   */
-  MachineState* testState;
-
-  /**
-   * model list
-   */
-  std::string modelList;
-
-  /**
-   * test passes
-   */
-  int testPass;
-
-  /**
-   * num passes
-   */
-  int numPasses;
-
-  /**
-   * saving period
-   */
-  int savingPeriod;
-
-  /**
-   * test wait
-   */
-  int testWait;
-
-  /**
-   * init model path
-   */
-  std::string initModelPath;
-
-  /**
-   * save only one
-   */
-  bool saveOnlyOne;
-
-  /**
-   * testing mode
-   */
-  bool testing;
-
-  /**
-   * mode
-   */
-  int mode;
-
-  /**
-   * config loc
-   */
-  std::string config;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.cpp b/paddle/legacy/trainer/ThreadParameterUpdater.cpp
deleted file mode 100644
index 0601bdf24..000000000
--- a/paddle/legacy/trainer/ThreadParameterUpdater.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadParameterUpdater.h"
-
-#include "paddle/legacy/utils/Logging.h"
-
-#include "paddle/legacy/math/SparseRowMatrix.h"
-#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
-#include "paddle/legacy/utils/Thread.h"
-
-DECLARE_int32(trainer_count);
-
-namespace paddle {
-
-SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
-    : config_(optConfig), numSamplesProcessed_(0) {
-  // fill types
-  auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/);
-  for (auto type : types) {
-    addParameterType(type);
-  }
-}
-
-void SgdThreadUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  // calc max parameter id
-  size_t maxId = 0;
-  for (auto& para : parameters_) {
-    maxId = std::max(maxId, para->getID());
-  }
-
-  optimizers_.resize(maxId + 1);
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid].reset(sgdOptimizerCreate(config_,
-                                              para->getConfig(),
-                                              para->isGradSparseUpdate(),
-                                              false /*inPserver*/));
-    size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
-    optimizers_[pid]->init(numRows, &para->getConfig());
-    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
-      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
-      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
-      // Parameter::enableType(). But gradient parameter buf is still used
-      // in SgdThreadUpdater. We need to explicitly create it.
-      //
-      // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT
-      // as a temp buffer.
-      para->enableBufType(PARAMETER_GRADIENT);
-    }
-  }
-}
-
-void SgdThreadUpdater::startPass() {
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startPass();
-  }
-}
-
-bool SgdThreadUpdater::finishPass() {
-  catchUpWith();
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishPass();
-  }
-  return true;
-}
-
-void SgdThreadUpdater::updateImpl(Parameter* para) {
-  if (!para->useGpu()) return;
-  SetDevice setDevice(para->getDeviceId());
-  ParameterOptimizer* optimizer = optimizers_[para->getID()].get();
-  optimizer->update(para->getBufs(), para->getConfig());
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(para->getBufs(), para->getConfig(), -1LU);
-  }
-
-  para->setValueUpdated();
-  para->clearGradient();
-}
-
-void SgdThreadUpdater::threadTraverse(
-    const ParameterOptimizer::TraverseCallback& callback,
-    int tid,
-    size_t numThreads,
-    Parameter* para) {
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  if (para->isGradSparseUpdate()) {
-    size_t height = para->getConfig().dims(0);
-    size_t width = para->getConfig().dims(1);
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  } else {  // dense
-    // setup sub bufs
-    auto interval = calcSplitArrayInterval(
-        para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    for (auto type : parameterTypes_) {
-      vecs[type]->subVecFrom(*para->getBuf(type), interval);
-    }
-
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
-  bool hasCpuPara = false;
-  bool hasGpuPara = false;
-  for (auto& para : parameters_) {
-    if (para->useGpu()) {
-      hasGpuPara = true;
-    } else {
-      hasCpuPara = true;
-    }
-  }
-
-  auto cpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (auto callback = getTraverseCallback(para.get())) {
-        threadTraverse(callback, tid, numThreads, para.get());
-      }
-    }
-  };
-  auto gpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->useGpu()) {
-        if (auto callback = getTraverseCallback(para.get())) {
-          SetDevice setDevice(para->getDeviceId());
-          callback(para->getBufs(), para->getConfig(), -1LU);
-        }
-      }
-    }
-  };
-
-  if (hasCpuPara && hasGpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse);
-  } else if (hasCpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse);
-  } else if (hasGpuPara) {
-    gpuTraverse(0, 0);
-  }
-}
-
-void SgdThreadUpdater::catchUpWith() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->startCatchUpWith();
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishCatchUpWith();
-  }
-}
-
-void SgdThreadUpdater::apply() {
-  catchUpWith();
-
-  traverse(
-      [this](Parameter* para) { return optimizers_[para->getID()]->apply(); });
-}
-
-void SgdThreadUpdater::restore() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->restore();
-  });
-}
-
-PassType SgdThreadUpdater::startBatch(int64_t batchSize) {
-  numSamplesProcessed_ += batchSize;
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startBatch(numSamplesProcessed_);
-  }
-  return PASS_TRAIN;
-}
-
-void SgdThreadUpdater::finishBatch(real cost) {
-  getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->isGradSparseUpdate()) {
-        threadUpdateSparse(tid, numThreads, para.get());
-      } else if (!para->useGpu()) {
-        threadUpdateDense(tid, numThreads, para.get());
-      }
-    }
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishBatch();
-  }
-}
-
-void SgdThreadUpdater::threadUpdateSparse(int tid,
-                                          size_t numThreads,
-                                          Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  size_t height = para->getConfig().dims(0);
-  size_t width = para->getConfig().dims(1);
-
-  if (dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get())) {
-    // From MultiGradientMachine
-    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
-
-    for (auto id : sparseIds) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    sparseIds.clear();
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(
-                 para->getMat(PARAMETER_GRADIENT).get())) {
-    // From NeuralNetwork
-    SparseRowCpuMatrix* mainMat = dynamic_cast<SparseRowCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-
-    std::vector<unsigned int>& localIndices =
-        mainMat->getIndexDictHandle()->localIndices;
-
-    auto interval =
-        calcSplitArrayInterval(localIndices.size(), tid, numThreads);
-    for (size_t i = interval.first; i < interval.second; ++i) {
-      auto id = localIndices[i];
-      real* row = mainMat->getLocalRow(i);
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        if (type == PARAMETER_GRADIENT) {
-          vecs[type]->subVecFrom(row, 0, width);
-        } else {
-          vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-        }
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    // For numThreads > 1, MultiGradientMachine is used, which goes
-    // to the above branch.
-    CHECK_EQ(numThreads, 1UL);
-    mainMat->clearIndices();
-  } else {
-    auto& m = *para->getMat(PARAMETER_GRADIENT).get();
-    LOG(FATAL) << "Internal error: " << para->getName() << " "
-               << typeid(m).name();
-  }
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  }
-}
-
-void SgdThreadUpdater::threadUpdateDense(int tid,
-                                         size_t numThreads,
-                                         Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  auto interval = calcSplitArrayInterval(
-      para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-
-  // setup sub bufs
-  for (auto type : parameterTypes_) {
-    vecs[type]->subVecFrom(*para->getBuf(type), interval);
-  }
-
-  // update
-  optimizer->update(vecs, para->getConfig());
-  vecs[PARAMETER_GRADIENT]->zeroMem();
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.h b/paddle/legacy/trainer/ThreadParameterUpdater.h
deleted file mode 100644
index 172287d4e..000000000
--- a/paddle/legacy/trainer/ThreadParameterUpdater.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/parameter/AverageOptimizer.h"
-#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
-#include "paddle/legacy/parameter/OptimizerFunctions.h"
-#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
-#include "paddle/legacy/parameter/Parameter.h"
-#include "paddle/legacy/parameter/Regularizer.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * \brief A parameter updater that uses multiple threads to update parameters.
-   This parameter updater handles GPU and CPU updates differently,
-   because at the current moment, the merging on CPU is happening on the
-   main thread, and the its parameter size can be much larger than the one GPU.
-   Thus, for GPU, the parameter updates happens in updateImpl() function, which
-   is called by gradient machines as a callback function supplied to backward()
-   and forwardBackward().
-   For CPU, the parameter updates happens in separate threads maintained by this
-   class.
- */
-class SgdThreadUpdater : public ParameterUpdater {
- public:
-  explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
-  virtual ~SgdThreadUpdater() {}
-
-  // Use the startPass() function of the base optimizer.
-  virtual void startPass();
-
-  // Use the finishPass() function of the base optimizer.
-  virtual bool finishPass();
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  virtual PassType startBatch(int64_t batchSize);
-  // Call finishBatch for each optimizer.
-  virtual void finishBatch(real cost);
-  virtual void catchUpWith();
-  virtual void apply();
-  virtual void restore();
-
- protected:
-  // This is the function that will be eventualy called by the GradientMachine.
-  // used only for GPU update.
-  virtual void updateImpl(Parameter* para);
-  OptimizationConfig config_;
-  int64_t numSamplesProcessed_;
-
-  // One optimizers for each parameter.
-  std::vector<std::unique_ptr<ParameterOptimizer>> optimizers_;
-
-  // The update function for CPU sparse parameters.
-  void threadUpdateSparse(int tid, size_t numThreads, Parameter* para);
-
-  // The update function for CPU dense parameters.
-  void threadUpdateDense(int tid, size_t numThreads, Parameter* para);
-  // The update function for after update operations, such as averager.
-  void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
-                      int tid,
-                      size_t numThreads,
-                      Parameter* para);
-  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
-      GetTraverseCallback;
-  void traverse(GetTraverseCallback getTraverseCallback);
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.cpp b/paddle/legacy/trainer/Trainer.cpp
deleted file mode 100644
index 2db754793..000000000
--- a/paddle/legacy/trainer/Trainer.cpp
+++ /dev/null
@@ -1,653 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Trainer.h"
-
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/utils/Common.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "TesterConfig.h"
-#include "ThreadParameterUpdater.h"
-#include "TrainerConfigHelper.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-
-DEFINE_string(config, "", "Trainer config file");
-
-DEFINE_int32(test_period,
-             0,
-             "if equal 0, do test on all test data at the end of "
-             "each pass. While if equal non-zero, do test on all test "
-             "data every test_period batches");
-DEFINE_bool(test_all_data_in_one_period,
-            false,
-            "This option was deprecated, since we will always do "
-            "test on all test set ");
-
-DEFINE_bool(local, true, "Train in local mode or not");
-
-DEFINE_int32(average_test_period,
-             0,
-             "Do test on average parameter every so"
-             " many batches. MUST be devided by FLAGS_log_period."
-             " Default 0 means do not test average parameter");
-
-DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-DEFINE_int64(saving_period_by_batches,
-             0,
-             "Save parameters every so many batches in one pass");
-DEFINE_string(save_dir, "", "Directory for saving model parameter");
-DEFINE_int32(start_pass,
-             0,
-             "Start training from this pass. "
-             "Will load parameter from the previous pass");
-DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
-DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-DEFINE_bool(with_cost, true, "enable cost layer or not");
-DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-DEFINE_int32(num_passes, 100, "train for so many passes");
-
-DEFINE_string(config_args,
-              "",
-              "arguments passed to config file."
-              "Format: key1=value1,key2=value2");
-
-DEFINE_bool(save_only_one,
-            false,
-            "Save only parameters in last pass, remove previous.");
-
-DEFINE_string(feat_file, "", "File name of extracted feature.");
-DEFINE_string(predict_output_dir,
-              "",
-              "Directory that saves the predicted results of output layers");
-DEFINE_string(model_list, "", "File that saves the model list when evaluation");
-
-namespace paddle {
-
-void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                   bool testing,
-                   const std::shared_ptr<GradientMachine>& gradientMachine,
-                   const std::shared_ptr<DataProvider>& dataProvider,
-                   const std::shared_ptr<DataProvider>& testDataProvider) {
-  this->stats_ = std::make_shared<TrainerStats>();
-
-  config_ = config;
-
-  config_->updateConfigFromFlags();
-
-  testing_ = testing;
-
-  // in testing, mode_ may GradientMachine::kTesting or
-  // GradientMachine::kSgdSparseCpuTraining
-
-  if (FLAGS_local) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "local and loadsave_parameters_in_pserver can not both true";
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdaterForEachParams();
-      LOG(INFO) << "ignore sparse_remote_update=true due to  --local=true";
-    }
-  }
-  if (FLAGS_loadsave_parameters_in_pserver) {
-    CHECK(config_->getOptConfig().use_sparse_remote_updater())
-        << "no parameter to load from pserver, please check network config";
-  }
-  if (testing && !FLAGS_loadsave_parameters_in_pserver) {
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdater();
-      LOG(INFO) << "because parameter is loaded local,"
-                << "tester ignore sparse_remote_update flag";
-    }
-  }
-
-  CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm()))
-      << "invalid algorithm configuration: "
-      << config_->getOptConfig().algorithm();
-
-  bool useSparseUpdater = false;
-  for (auto& paraConfig : config_->getModelConfig().parameters()) {
-    if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) {
-      useSparseUpdater = true;
-    }
-  }
-
-  if (FLAGS_use_mkldnn) {
-    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
-  }
-
-  if (testing) {
-    LOG(INFO) << "trainer: in testing mode";
-    if (config_->getOptConfig().use_sparse_remote_updater() ||
-        FLAGS_trainer_count > 1) {
-      mode_ = GradientMachine::kSgdSparseCpuTraining;
-      LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-    } else {
-      mode_ = GradientMachine::kTesting;
-      LOG(INFO) << "trainer mode: Testing";
-    }
-  } else if (IGradientMachineMode::tryGetMode(
-                 (int*)&mode_,
-                 config_->getOptConfig().algorithm(),
-                 FLAGS_trainer_count,
-                 FLAGS_local,
-                 FLAGS_use_gpu)) {
-    LOG(INFO) << "Custom trainer mode.";
-  } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD ||
-              config_->getOptConfig().algorithm() ==
-                  TrainAlgorithm::AsyncSGD) &&
-             useSparseUpdater) {
-    mode_ = GradientMachine::kSgdSparseCpuTraining;
-    LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-  } else {
-    mode_ = GradientMachine::kNormal;
-    LOG(INFO) << "trainer mode: Normal";
-  }
-
-  // initialize trainer internal
-  trainerInternal_.init(config_,
-                        gradientMachine,
-                        TrainerInternalConfig::createFromMode(mode_),
-                        stats_,
-                        testing);
-  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(FLAGS_save_only_one,
-                              FLAGS_saving_period,
-                              FLAGS_loadsave_parameters_in_pserver,
-                              FLAGS_config));
-
-  paramUtil_.reset(
-      new paddle::ParameterUtil(config_,
-                                std::move(paramConfig),
-                                trainerInternal_.getGradientMachine(),
-                                trainerInternal_.getParameterUpdater()));
-
-  bool gpuData =
-      FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
-      (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
-
-  dataProvider_ = dataProvider;
-  if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
-    dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
-  }
-  if (!testDataProvider_) {
-    // No evaluator_ if there is testDataProvider but no dataProvider.
-    evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator());
-    currentEvaluator_.reset(
-        trainerInternal_.getGradientMachine()->makeEvaluator());
-    if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 &&
-        config_->getOptConfig().average_window() > 0) {
-      CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0)
-          << "FLAGS_average_test_period must be divided by FALGS_log_period";
-      averageEvaluator_.reset(
-          trainerInternal_.getGradientMachine()->makeEvaluator());
-    }
-  }
-
-  testDataProvider_ = testDataProvider;
-  if (!testDataProvider_ && config_->hasTestDataConfig()) {
-    testDataProvider_.reset(
-        DataProvider::create(config_->getTestDataConfig(), *config_, gpuData));
-  }
-  if (testDataProvider_) {
-    createTester();
-  }
-
-  if (!testing &&
-      (trainerInternal_.getGradientMachine()->hasStaticParameters())) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "is_static and loadsave_parameters_in_pserver can not both true";
-  }
-  if (testing) {
-    // will load per pass for tester
-  } else if (paramUtil_->tryLoadParametersFromConfig()) {
-    // load from config already.
-  } else {
-    trainerInternal_.getGradientMachine()->randParameters();
-  }
-
-  // Only non static parameters need to be updated
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  if (trainerInternal_.getParameterUpdater()) {
-    trainerInternal_.getParameterUpdater()->init(parameters);
-
-    if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) {
-      if (testing) {
-        // will load per pass for tester
-      } else if (!config_->getConfig().init_model_path().empty() &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        paramUtil_->loadParametersWithPath(
-            config_->getConfig().init_model_path(),
-            false /*local*/,
-            true /*remote*/);
-      } else if (config_->getConfig().start_pass() > 0 &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1,
-                                         false /*local*/,
-                                         true /*remote*/));
-      } else {
-        trainerInternal_.getParameterUpdater()->randParametersRemote();
-      }
-    }
-  }
-
-  // set current evaluator and evalutor
-  trainerInternal_.setCurrentEvaluator(currentEvaluator_.get());
-  trainerInternal_.setEvaluator(evaluator_.get());
-}
-
-void Trainer::train(size_t numPasses) {
-  startTrain();
-  for (size_t i = 0; i < numPasses; ++i) {
-    if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) {
-      trainOnePassBatch(config_->getConfig().start_pass() + i);
-    } else {
-      trainOnePass();
-    }
-    if (i < numPasses - 1) {
-      dataProvider_->reset();
-    }
-  }
-
-  finishTrain();
-}
-
-static double genPerturbation(real* d, real* grad, size_t dim) {
-  auto& reng = ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<double> dist(-1, 1);
-  double gradNorm = 0, dNorm = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    d[i] = dist(reng);
-    dNorm += d[i] * d[i];
-    gradNorm += grad[i] * grad[i];
-  }
-  if (gradNorm > 0) {
-    real s = 0.5 * sqrt(gradNorm / dNorm);
-    for (size_t i = 0; i < dim; ++i) {
-      d[i] = s * d[i] + grad[i];
-    }
-  }
-  double delta = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    delta += grad[i] * d[i];
-  }
-  return delta;
-}
-
-real Trainer::checkGradient() {
-  trainerInternal_.getGradientMachine()->start();
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-
-  dataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  std::vector<Argument>& inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sum(outArgs);
-  LOG(INFO) << "original cost=" << cost;
-  trainerInternal_.getGradientMachine()->backward();
-
-  real maxDiff = 0;
-  char fill = ' ';
-  for (auto& parameter : parameters) {
-    CpuVector oldPara(parameter->getSize());
-    CpuVector newPara(parameter->getSize());
-    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
-    real* grad = cpuGrad.getData();
-    size_t dim = parameter->getSize();
-    std::vector<real> d(dim);
-
-    double delta = genPerturbation(d.data(), grad, dim);
-
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    delta *= step;
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] + step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sum(outArgs);
-
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] - step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sum(outArgs);
-
-    real trueDelta = 0.5 * (newCost1 - newCost2);
-    real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill)
-              << std::setw(20) << parameter->getName()
-              << "step=" << std::setw(15) << step << "cost1=" << std::setw(10)
-              << newCost1 << "cost2=" << std::setw(10) << newCost2
-              << "true_delta=" << std::setw(15) << trueDelta
-              << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff
-              << (std::abs(diff) > 0.01 ? " ***" : "");
-
-    maxDiff = std::max(maxDiff, std::abs(diff));
-
-    // restore parameter
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
-    parameter->setValueUpdated();
-
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-  return maxDiff;
-}
-
-void Trainer::startTrain() {
-  trainPassContext_.passId = config_->getConfig().start_pass();
-  srand(config_->getConfig().start_pass() + 1);
-  if (dataProvider_) {
-    dataProvider_->reset();
-  }
-
-  trainerInternal_.getGradientMachine()->start();
-}
-
-void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }
-
-void Trainer::startTrainPass() {
-  stats_->reset();
-  trainPassContext_.batchId = 0;
-  trainPassContext_.avgTestCost = 0;
-  trainPassContext_.numAvgTests = 0;
-  trainPassContext_.passInnerId = 1;
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-  if (FLAGS_prev_batch_state) {
-    trainerInternal_.getGradientMachine()->resetState();
-    trainerInternal_.getGradientMachine()->getState(testState_);
-  }
-}
-
-void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
-  int num = dataBatch.getSize();
-  if (averageEvaluator_) {
-    int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period;
-    if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
-      if (mod == FLAGS_average_test_period - FLAGS_log_period) {
-        averageEvaluator_->start();
-      }
-      trainerInternal_.getParameterUpdater()->apply();
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->getState(trainState_);
-      }
-      trainPassContext_.avgTestCost += tester_->forwardOneBatch(
-          dataBatch, averageEvaluator_.get(), &forwardOutput_);
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->setState(trainState_);
-      }
-      trainPassContext_.numAvgTests += num;
-      trainerInternal_.getParameterUpdater()->restore();
-    }
-  }
-  {
-    REGISTER_TIMER("TrainBatch");
-    trainerInternal_.trainOneBatch(
-        trainPassContext_.batchId, dataBatch, &forwardOutput_);
-  }
-
-  if (averageEvaluator_ &&
-      trainPassContext_.batchId % FLAGS_average_test_period ==
-          FLAGS_average_test_period - 1) {
-    averageEvaluator_->finish();
-    LOG(INFO) << " Averaged parameter:"
-              << " cost="
-              << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests
-              << " Eval: " << *averageEvaluator_;
-    trainPassContext_.numAvgTests = 0;
-    trainPassContext_.avgTestCost = 0;
-  }
-
-  ++trainPassContext_.batchId;
-
-  if (trainPassContext_.batchId % FLAGS_log_period == 0) {
-    FOR_TIMING(globalStat.setThreadInfo(true));
-    FOR_TIMING(globalStat.printAllStatus());
-    FOR_TIMING(globalStat.reset());
-  }
-
-  if (testDataProvider_ && FLAGS_test_period > 0 &&
-      trainPassContext_.batchId % FLAGS_test_period == 0) {
-    tester_->testOnePeriod();
-  }
-
-  if (FLAGS_saving_period_by_batches > 0 &&
-      trainPassContext_.batchId >
-          FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
-      0 == FLAGS_trainer_id) {
-    trainerInternal_.getParameterUpdater()->catchUpWith();
-    if (testDataProvider_) {
-      tester_->testOnePeriod();
-    }
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId,
-                                      trainPassContext_.passInnerId);
-    ++trainPassContext_.passInnerId;
-  }
-}
-
-void Trainer::finishTrainPass() {
-  if (trainPassContext_.batchId == 0) {
-    // This means no more data from DataProvider
-    return;
-  }
-
-  trainerInternal_.finishTrainPass(trainPassContext_.passId,
-                                   trainPassContext_.batchId);
-
-  FOR_TIMING(globalStat.setThreadInfo(true));
-  FOR_TIMING(globalStat.printAllStatus());
-  FOR_TIMING(globalStat.reset());
-
-  if (testDataProvider_) {
-    tester_->testOnePeriod();
-  }
-
-  if (trainPassContext_.passId % FLAGS_saving_period == 0 &&
-      FLAGS_trainer_id == 0) {
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId);
-  }
-  ++trainPassContext_.passId;
-}
-
-void Trainer::trainOnePass() {
-  startTrainPass();
-  size_t batchSize = config_->getOptConfig().batch_size();
-  while (true) {
-    DataBatch dataBatch;
-
-    int num = 0;
-    {
-      REGISTER_TIMER("getTrainBatch");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-    if (num == 0) break;
-    CHECK_EQ(num, dataBatch.getSize());
-    trainOneDataBatch(dataBatch);
-  }
-
-  finishTrainPass();
-}
-
-void Trainer::trainOnePassBatch(int passId) {
-  this->stats_->reset();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  const std::vector<Argument> inArgs;
-  {
-    REGISTER_TIMER("onePass");
-    trainerInternal_.getGradientMachine()->forwardBackward(
-        inArgs, nullptr, PASS_TRAIN, nullptr);
-  }
-
-  real cost = .0;
-  int64_t num = 0;
-  trainerInternal_.getGradientMachine()->getStats(cost, num);
-  *stats_ += {num, cost};
-
-  trainerInternal_.getGradientMachine()->onPassEnd();
-
-  bool accepted = trainerInternal_.getParameterUpdater()->finishPass();
-
-  globalStat.setThreadInfo(true);
-  globalStat.printAllStatus();
-  globalStat.reset();
-
-  LOG(INFO) << " Pass=" << passId
-            << " AcceptedPass=" << (accepted ? acceptedPassId_ : -1)
-            << stats_->getStats(false /*withCurrentCost*/);
-
-  if (accepted) {
-    if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) {
-      paramUtil_->saveParameters(acceptedPassId_);
-    }
-    acceptedPassId_++;
-    if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) {
-      paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period);
-    }
-  }
-}
-
-real Trainer::calcGradient(const DataBatch& dataBatch,
-                           const Vector& value,
-                           Vector& gradient) {
-  CHECK_EQ(value.getSize(), gradient.getSize());
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getParameters();
-
-  clearGradient();
-
-  size_t offset = 0;
-  size_t valueSize = value.getSize();
-
-  for (auto& para : parameters) {
-    CHECK_LE(offset + para->getSize(), valueSize);
-    VectorPtr val =
-        Vector::create(para->getSize(), value.getMemoryHandle(), offset);
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*val);
-    para->setValueUpdated();
-    offset += para->getSize();
-  }
-
-  CHECK_EQ(offset, valueSize);
-
-  std::vector<Argument> inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forwardBackward(
-      inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sum(outArgs);
-
-  offset = 0;
-  for (auto& para : parameters) {
-    VectorPtr grad =
-        Vector::create(para->getSize(), gradient.getMemoryHandle(), offset);
-    if (para->getBuf(PARAMETER_GRADIENT)) {
-      grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    }
-    offset += para->getSize();
-  }
-
-  return cost;
-}
-
-void Trainer::clearGradient() {
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  for (auto& parameter : parameters) {
-    parameter->clearGradient();
-  }
-}
-
-int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }
-
-void Trainer::createTester() {
-  tester_.reset(new paddle::Tester(config_,
-                                   createTesterConfig(),
-                                   trainerInternal_.getGradientMachine(),
-                                   trainerInternal_.getParameterUpdater(),
-                                   testDataProvider_));
-}
-
-void Trainer::test() { tester_->test(); }
-
-std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
-  TesterConfig* conf = new TesterConfig;
-  if (FLAGS_test_period) {
-    LOG(WARNING) << "The meaning of --test_period is changed: "
-                 << "if equal 0, do test on all test data at the end of "
-                 << "each pass. While if equal non-zero, do test on all test "
-                 << "data every test_period batches ";
-  }
-  if (FLAGS_test_all_data_in_one_period) {
-    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
-                 << "we will always do test on all test set ";
-  }
-  conf->testPeriod = FLAGS_test_period;
-  conf->prevBatchState = FLAGS_prev_batch_state;
-  conf->logPeriod = FLAGS_log_period;
-  conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
-  conf->featFile = FLAGS_feat_file;
-  conf->predictOutputDir = FLAGS_predict_output_dir;
-  conf->trainerId = FLAGS_trainer_id;
-  conf->distributeTest = FLAGS_distribute_test;
-  conf->config = FLAGS_config;
-  conf->modelList = FLAGS_model_list;
-  conf->testPass = FLAGS_test_pass;
-  conf->numPasses = FLAGS_num_passes;
-  conf->savingPeriod = FLAGS_saving_period;
-  conf->testWait = FLAGS_test_wait;
-  conf->initModelPath = FLAGS_init_model_path;
-  conf->saveOnlyOne = FLAGS_save_only_one;
-  conf->testing = testing_;
-  conf->mode = mode_;
-  conf->trainState = &trainState_;
-  conf->testState = &testState_;
-  return std::unique_ptr<TesterConfig>(conf);
-}
-
-ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); }
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.h b/paddle/legacy/trainer/Trainer.h
deleted file mode 100644
index b467f9af0..000000000
--- a/paddle/legacy/trainer/Trainer.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "Tester.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternal.h"
-
-DECLARE_int32(num_passes);
-
-namespace paddle {
-
-/**
- * Trainer Class
- *
- * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to
- * train/test a NeuralNetwork.
- */
-class Trainer {
- public:
-  /**
-   * Ctor.
-   * @return
-   */
-  Trainer() : acceptedPassId_(0) {}
-
-  virtual ~Trainer() {}
-
-  /**
-   * initialize a new trainer using config
-   *
-   * @param config TrainerConfig.
-   * @param testing true if only for testing
-   * @param gradientMachine GradientMachine that will be trained.
-   *                        nullptr if create from config.
-   * @param dataProvider Train Data Provider. null if create from config.
-   * @param testDataProvider Test Data Provider. null if create from config.
-   */
-  virtual void init(
-      const std::shared_ptr<TrainerConfigHelper>& config,
-      bool testing = false,
-      const std::shared_ptr<GradientMachine>& gradientMachine = nullptr,
-      const std::shared_ptr<DataProvider>& dataProvider = nullptr,
-      const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
-
-  /**
-   * Train until num_passes reached.
-   * One pass means neural network train through all training data.
-   *
-   * @param numPasses the number of traning pass.
-   * @note Durning neural network training, the num passes may set a very large
-   * value, and kill training process when result is good enough.
-   */
-  void train(size_t numPasses = (size_t)FLAGS_num_passes);
-
-  /**
-   * compare the gradient from bp with finite difference
-   * @return  the maximal difference
-   */
-  real checkGradient();
-
-  void startTrain();
-  void finishTrain();
-  void startTrainPass();
-  void finishTrainPass();
-  void trainOneDataBatch(DataBatch& dataBatch);
-  void time();
-
-  /**
-   * given a dataBatch and the current parameter value
-   * calculate its gradient and return the cost.
-   *
-   * TODO(yuyang18): I think this method is deprecated and buggy. Should it be
-   * removed?
-   */
-  real calcGradient(const DataBatch& dataBatch,
-                    const Vector& value,
-                    Vector& gradient);
-
-  /**
-   * Get Trainer Config.
-   */
-  const TrainerConfig& getConfig() const { return config_->getConfig(); }
-
-  /**
-   * Get Train Data Provider
-   */
-  const DataProviderPtr& getDataProvider() { return dataProvider_; }
-
-  /**
-   * Get Gradient Machine.
-   */
-  const GradientMachinePtr& getGradientMachine() {
-    return trainerInternal_.getGradientMachine();
-  }
-
-  /**
-   * Get batch size in optimization config.
-   * @note This method didn't return the actual batch size. Just batch size
-   * set in the optimization config. The actual batch size in one trainer may
-   * less than batch size in config due to there are not enough data.
-   */
-  int getBatchSize();
-
-  /**
-   * Do test job
-   */
-  void test();
-
-  /**
-   * Get parameter util ptr
-   *
-   * TODO(yuyang18): Make it return a smart pointer.
-   */
-  ParameterUtil* getParameterUtilPtr();
-
- protected:
-  /**
-   * Train one pass of data.
-   *
-   * SGD Method.
-   */
-  void trainOnePass();
-
-  /**
-   * Train one pass in one batch.
-   *
-   */
-  void trainOnePassBatch(int passId);
-
-  /**
-   * set parameter gradient to zero
-   */
-  void clearGradient();
-
-  void createTester();
-
- private:
-  std::unique_ptr<TesterConfig> createTesterConfig();
-
- protected:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::shared_ptr<TrainerStats> stats_;
-
-  DataProviderPtr dataProvider_;
-  DataProviderPtr testDataProvider_;
-  MachineState trainState_;
-  MachineState testState_;
-
-  struct TrainPassContext {
-    int64_t batchId;
-    real avgTestCost;
-    int64_t numAvgTests;
-    int passId;
-    int passInnerId;
-  };
-  std::vector<paddle::Argument> forwardOutput_;
-
-  TrainPassContext trainPassContext_;
-
-  std::unique_ptr<Evaluator> evaluator_;
-  std::unique_ptr<Evaluator> currentEvaluator_;
-  std::unique_ptr<Evaluator> averageEvaluator_;
-  // training mode
-  // used to decide which GradientMachine and ParameterUpdater to create
-  GradientMachine::CreateMode mode_;
-  int testing_;
-  int acceptedPassId_;
-
-  // trainer tester
-  std::unique_ptr<Tester> tester_;
-
-  // parameter util
-  std::unique_ptr<ParameterUtil> paramUtil_;
-
-  // trainer Internal
-  TrainerInternal trainerInternal_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerBenchmark.cpp b/paddle/legacy/trainer/TrainerBenchmark.cpp
deleted file mode 100644
index 7f5bd2335..000000000
--- a/paddle/legacy/trainer/TrainerBenchmark.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-
-#include "Trainer.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-DECLARE_int32(test_period);
-
-DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
-
-namespace paddle {
-
-void Trainer::time() {
-  startTrain();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-  int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
-                           << num << " != " << batchSize;
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  std::vector<paddle::Argument> outputs;
-  // burning time
-  LOG(INFO) << "Burning time...";
-  for (int n = 0; n < 10; ++n) {
-    trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-  }
-  LOG(INFO) << "Burning time end.";
-
-  for (int n = 0; n < FLAGS_test_period; n++) {
-    if (FLAGS_feed_data) {
-      REGISTER_TIMER("GetData");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-
-    if (num != batchSize) {
-      break;
-    }
-
-    {
-      REGISTER_TIMER("FwdBwd");
-      trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-    }
-  }
-  globalStat.setThreadInfo(true);
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-
-  finishTrain();
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.cpp b/paddle/legacy/trainer/TrainerConfigHelper.cpp
deleted file mode 100644
index 4d31ba8d7..000000000
--- a/paddle/legacy/trainer/TrainerConfigHelper.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerConfigHelper.h"
-#include "ParamUtil.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/legacy/utils/Flags.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-DECLARE_string(save_dir);
-DECLARE_int32(trainer_id);
-DECLARE_bool(local);
-DECLARE_bool(with_cost);
-DECLARE_bool(with_gpu);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkl_packed);
-
-const char *kConfigParserModuleName = "paddle.trainer.config_parser";
-const char *kConfigParserFuncName = "parse_config_and_serialize";
-
-namespace paddle {
-
-struct TrainerConfigHelperPrivate {
-  TrainerConfig conf;
-};
-
-TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
-    : m(new TrainerConfigHelperPrivate()) {
-  std::ostringstream configArgs;
-  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
-             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
-             << ",parallel_nn=" << FLAGS_parallel_nn
-             << ",use_mkldnn=" << FLAGS_use_mkldnn
-             << ",use_mkl_packed=" << FLAGS_use_mkl_packed
-             << ",cudnn_version=" << hl_get_cudnn_lib_version();
-  if (!FLAGS_config_args.empty()) {
-    configArgs << "," << FLAGS_config_args;
-  }
-
-  VLOG(3) << "Parsing trainer config " << configFilePath;
-  std::string configProtoStr =
-      callPythonFunc(kConfigParserModuleName,
-                     kConfigParserFuncName,
-                     {configFilePath, configArgs.str()});
-  CHECK(m->conf.ParseFromString(configProtoStr));
-}
-
-TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
-    : m(new TrainerConfigHelperPrivate()) {
-  m->conf = config;
-}
-
-TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
-
-const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
-
-TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; }
-
-const OptimizationConfig &TrainerConfigHelper::getOptConfig() const {
-  return m->conf.opt_config();
-}
-
-const ModelConfig &TrainerConfigHelper::getModelConfig() const {
-  return m->conf.model_config();
-}
-
-const DataConfig *TrainerConfigHelper::getDataConfigPtr() const {
-  if (m->conf.has_data_config()) {
-    return &m->conf.data_config();
-  } else {
-    return nullptr;
-  }
-}
-
-const DataConfig &TrainerConfigHelper::getTestDataConfig() const {
-  CHECK(m->conf.has_test_data_config());
-  return m->conf.test_data_config();
-}
-
-bool TrainerConfigHelper::hasDataConfig() const {
-  return m->conf.has_data_config();
-}
-
-bool TrainerConfigHelper::hasTestDataConfig() const {
-  return m->conf.has_test_data_config();
-}
-
-void TrainerConfigHelper::updateConfigFromFlags() {
-  if (!FLAGS_save_dir.empty()) {
-    m->conf.set_save_dir(FLAGS_save_dir);
-  }
-  if (!FLAGS_init_model_path.empty()) {
-    m->conf.set_init_model_path(FLAGS_init_model_path);
-  }
-  if (FLAGS_start_pass != 0) {
-    m->conf.set_start_pass(FLAGS_start_pass);
-  }
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdater() {
-  m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false);
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() {
-  this->disableRemoteSparseUpdater();
-  for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) {
-    m->conf.mutable_model_config()
-        ->mutable_parameters(i)
-        ->set_sparse_remote_update(false);
-  }
-}
-
-OptimizationConfig &TrainerConfigHelper::getOptConfig() {
-  return *m->conf.mutable_opt_config();
-}
-
-void TrainerConfigHelper::setSaveDir(const std::string &saveDir) {
-  m->conf.set_save_dir(saveDir);
-}
-
-const std::string &TrainerConfigHelper::getSaveDir() const {
-  return m->conf.save_dir();
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPath(
-    const std::string &modelPath) {
-  std::ifstream s(path::join(modelPath, "path.txt"));
-  CHECK(s.is_open()) << " fail to open path.txt";
-  std::string ss;
-  getline(s, ss);
-  VLOG(3) << "fileName " << path::join(modelPath, ss);
-  s.close();
-  return path::join(modelPath, ss);
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPassId(
-    int passId, const std::string &modelPath) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf));
-}
-
-std::string TrainerConfigHelper::getConfigName(bool *ok) const {
-  std::string retv = "";
-
-  if (!m->conf.config_file().empty()) {
-    retv = m->conf.config_file();
-  } else if (!m->conf.init_model_path().empty()) {
-    retv = getConfigNameFromPath(m->conf.init_model_path());
-  } else if (m->conf.start_pass() >= 1) {
-    retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir());
-  }
-
-  if (ok) {
-    *ok = !retv.empty();
-  }
-
-  return retv;
-}
-
-std::shared_ptr<TrainerConfigHelper> TrainerConfigHelper::createFromFlags() {
-  std::string configPath;
-  if (!FLAGS_config.empty()) {
-    configPath = FLAGS_config;
-  } else if (!FLAGS_init_model_path.empty()) {
-    configPath = getConfigNameFromPath(FLAGS_init_model_path);
-  } else if (FLAGS_start_pass >= 1) {
-    configPath =
-        getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path);
-  } else {
-    return nullptr;
-  }
-  return std::make_shared<TrainerConfigHelper>(configPath);
-}
-
-std::shared_ptr<TrainerConfigHelper>
-TrainerConfigHelper::createFromFlagConfig() {
-  CHECK(!FLAGS_config.empty());
-  return std::make_shared<TrainerConfigHelper>(FLAGS_config);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.h b/paddle/legacy/trainer/TrainerConfigHelper.h
deleted file mode 100644
index 0e428bea2..000000000
--- a/paddle/legacy/trainer/TrainerConfigHelper.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/legacy/utils/Logging.h>
-#include <paddle/legacy/utils/Util.h>
-#include <memory>
-
-namespace paddle {
-
-class TrainerConfig;
-class OptimizationConfig;
-struct TrainerConfigHelperPrivate;
-class ModelConfig;
-class DataConfig;
-
-/**
- * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object,
- * simplize the usage for TrainerConfig.
- *
- * The all operation to TrainerConfig object should use this object. It remove
- * many copy & paste code in trainer.
- *
- * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not.
- * Define a macro to unify 'final' keyword
- */
-class TrainerConfigHelper /*final*/ {
- public:
-  DISABLE_COPY(TrainerConfigHelper);
-
-  /**
-   * @brief Ctor, Create a TrainerConfig from config file
-   * @param configFilePath Config file path.
-   */
-  explicit TrainerConfigHelper(const std::string& configFilePath);
-  explicit TrainerConfigHelper(const TrainerConfig& config);
-
-  /**
-   * Dtor
-   * @warning this class is a final class. Should not be inherited.
-   */
-  ~TrainerConfigHelper();
-
-  /**
-   * @brief Get Trainer Config itself.
-   */
-  const TrainerConfig& getConfig() const;
-
-  TrainerConfig& getMutableConfig();
-
-  /**
-   * @brief Get Optimizer Config.
-   */
-  const OptimizationConfig& getOptConfig() const;
-
-  /**
-   * @brief Get Model Config.
-   */
-  const ModelConfig& getModelConfig() const;
-
-  /**
-   * @brief Get Train Data Config Pointer.
-   * @return nullptr if there is no train data. Else will return pointer
-   */
-  const DataConfig* getDataConfigPtr() const;
-
-  /**
-   * @brief Get Tain Data Config.
-   * @warning Core when there is no train data.
-   */
-  const DataConfig& getDataConfig() const {
-    CHECK(this->hasDataConfig());
-    auto conf = this->getDataConfigPtr();
-    return *conf;
-  }
-
-  /**
-   * @brief Get test data config
-   * @warning Core when there is no test data.
-   */
-  const DataConfig& getTestDataConfig() const;
-
-  /**
-   * @brief Has train data config or not.
-   * @return true if has train data.
-   */
-  bool hasDataConfig() const;
-
-  /**
-   * @brief Has test data config or not.
-   * @return true if has test data.
-   */
-  bool hasTestDataConfig() const;
-
-  /**
-   * @brief Update trainer config from command line flags.
-   *        Override config's (save_dir, init_model_path, start_pass) if command
-   *        flags is existed.
-   */
-  void updateConfigFromFlags();
-
-  /**
-   * @brief Disable optimization's sparse remote update.
-   */
-  void disableRemoteSparseUpdater();
-
-  /**
-   * @brief Disable optimization and each parameter's sparse remote update.
-   */
-  void disableRemoteSparseUpdaterForEachParams();
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const TrainerConfig&() const { return this->getConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const OptimizationConfig&() const {
-    return this->getOptConfig();
-  }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const DataConfig&() const { return this->getDataConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const ModelConfig&() const { return this->getModelConfig(); }
-
-  /**
-   * @brief Get mutable optimization config.
-   */
-  OptimizationConfig& getOptConfig();
-
-  /**
-   * @brief set model save directory.
-   * @param saveDir Directory path.
-   */
-  void setSaveDir(const std::string& saveDir);
-
-  /**
-   * @brief get model save directory.
-   * @return save directory path.
-   */
-  const std::string& getSaveDir() const;
-
-  /**
-   * @brief Get config file name from model path.
-   *
-   * Paddle save model to a directory, and write a file 'path.txt' which save
-   * config filename.
-   *
-   * @param modelPath model saved directory.
-   * @return config file name.
-   */
-  static std::string getConfigNameFromPath(const std::string& modelPath);
-
-  /**
-   * @brief Get config file name from this config instance.
-   * @param[out] ok true if no error.
-   * @return config file name.
-   */
-  std::string getConfigName(bool* ok = nullptr) const;
-
-  /**
-   * @brief Try to create TrainerConfigHelper from all command line flags.
-   *        Try to load from --config, --init_model_path, --start_pass one by
-   *        one. Return nullptr if cannot load TrainerConfigHelper from all
-   *        these place.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlags();
-
-  /**
-   * @brief Try to create TrainerConfigHelper only from '--config' flag.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
-
- private:
-  static std::string getConfigNameFromPassId(int passId,
-                                             const std::string& modelPath);
-
-  TrainerConfigHelperPrivate* m;
-};
-
-typedef std::shared_ptr<TrainerConfigHelper> TrainerConfigHelperPtr;
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.cpp b/paddle/legacy/trainer/TrainerInternal.cpp
deleted file mode 100644
index ee3dea634..000000000
--- a/paddle/legacy/trainer/TrainerInternal.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerInternal.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/legacy/gserver/layers/ValidationLayer.h"
-#include "paddle/legacy/utils/GlobalConstants.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-#include "paddle/legacy/utils/Stat.h"
-#include "paddle/legacy/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "ThreadParameterUpdater.h"
-
-namespace paddle {
-
-void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                           const GradientMachinePtr& gradientMachine,
-                           std::unique_ptr<TrainerInternalConfig>&& intconfig,
-                           const std::shared_ptr<TrainerStats>& stats,
-                           bool testing) {
-  config_ = config;
-  intconfig_ = std::move(intconfig);
-  stats_ = stats;
-
-  //! in training will use parameter updater definitly.
-  //! But only use parameter in testing mode when some parameter in pserver.
-  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
-                   intconfig_->loadsave_parameters_in_pserver)) {
-    createParameterUpdater(testing);
-  }
-
-  gradientMachine_ = gradientMachine;
-  if (!gradientMachine) {
-    CHECK(config_->getConfig().has_model_config())
-        << "Missing model_config in trainer_config";
-    gradientMachine_.reset(
-        GradientMachine::create(config_->getConfig().model_config(),
-                                intconfig_->mode,
-                                parameterUpdater_->getParameterTypes()));
-  }
-}
-
-void TrainerInternal::trainOneBatch(int64_t batchId,
-                                    const DataBatch& dataBatch,
-                                    std::vector<Argument>* outArgs) {
-  // true means updating parameter whenever gradient is ready during backward()
-  bool doPipelineUpdate =
-      (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
-      (intconfig_->local || intconfig_->use_gpu ||
-       intconfig_->trainer_count <= 1);
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return;
-  }
-
-  bool showStats = intconfig_->show_param_stats_period > 0 &&
-                   (batchId + 1) % intconfig_->show_param_stats_period == 0 &&
-                   intconfig_->trainer_id == 0;
-
-  std::vector<ParaStat> paraStats;
-  if (showStats) {
-    paraStats.resize(gradientMachine_->getParameters().size());
-  }
-
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-
-  PassType passType = parameterUpdater_->startBatch(actualBatchSize);
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote();
-  }
-
-  UpdateCallback updateCallback = [this, showStats, &paraStats](
-      Parameter* para) {
-    if (showStats) {
-      //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor
-      // it
-      //! to ParameterHook.
-      auto& grad = para->getBuf(PARAMETER_GRADIENT);
-      SetDevice device(para->getDeviceId());
-      paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize();
-      paraStats[para->getID()].maxAbsGrad = grad->getAbsMax();
-    }
-    parameterUpdater_->update(para);
-  };
-
-  {
-#ifndef PADDLE_DISABLE_TIMER
-    Timer timer;
-    timer.start();
-#endif
-    REGISTER_TIMER("forwardBackward");
-    forwardBackwardBatch(
-        inArgs, *outArgs, passType, updateCallback, doPipelineUpdate);
-#ifndef PADDLE_DISABLE_TIMER
-    timer.stop();
-    parameterUpdater_->setForwardbackwardTime(timer.get());
-#endif
-  }
-
-  if (!doPipelineUpdate) {
-    auto& parameters = gradientMachine_->getNonStaticParameters();
-    for (auto& para : parameters) {
-      updateCallback(para.get());
-    }
-  }
-
-  real cost = 0;
-  {
-    REGISTER_TIMER("sumCost");
-    cost = Argument::sum(*outArgs);
-  }
-
-  if (batchId % intconfig_->log_period == 0) {
-    currentEvaluator_->start();
-    stats_->resetCurrentStat();
-  }
-  {
-    REGISTER_TIMER("eval");
-    gradientMachine_->eval(currentEvaluator_);
-    gradientMachine_->eval(evaluator_);
-  }
-
-  *stats_ += {actualBatchSize, cost};
-  {
-    REGISTER_TIMER("finishBatch");
-    parameterUpdater_->finishBatch(cost);
-  }
-
-  if (showStats) {
-    showParameterStats(paraStats);
-  }
-  if ((batchId + 1) % intconfig_->log_period == 0) {
-    currentEvaluator_->finish();
-
-    if (intconfig_->dot_period > 0) {
-      std::cerr << std::endl;
-    }
-    LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_
-              << " Eval: " << *evaluator_
-              << " CurrentEval: " << *currentEvaluator_;
-  } else if (intconfig_->dot_period > 0 &&
-             (batchId + 1) % intconfig_->dot_period == 0) {
-    std::cerr << ".";
-  }
-}
-
-/**
- * finish train pass
- */
-void TrainerInternal::finishTrainPass(int passId, int batchId) {
-  gradientMachine_->onPassEnd();
-  parameterUpdater_->finishPass();
-  evaluator_->finish();
-  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " "
-            << stats_->getStats(false /*without current cost*/)
-            << " Eval: " << *evaluator_;
-}
-
-void TrainerInternal::showParameterStats(
-    const std::vector<ParaStat>& paraStats) {
-  std::vector<ParameterPtr>& parameters = gradientMachine_->getParameters();
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum();
-    const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE);
-    std::ostringstream osLrHistogram;
-    if (lr) {
-      if (VLOG_IS_ON(2)) {
-        osLrHistogram << " lr_histogram: ";
-        lr->histogram(osLrHistogram);
-      } else {
-        osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax()
-                      << " min_lr=" << std::setw(11) << lr->getMin()
-                      << " avg_lr=" << std::setw(11)
-                      << lr->getSum() / parameter->getSize();
-      }
-    }
-    int pid = parameter->getID();
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << std::setw(20) << parameter->getName()
-              << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize()
-              << " max_val=" << std::setw(11)
-              << parameter->getBuf(PARAMETER_VALUE)->getAbsMax()
-              << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad
-              << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad
-              << osLrHistogram.str();
-  }
-}
-
-void TrainerInternal::createParameterUpdater(bool testing) {
-  const std::string& alg = config_->getOptConfig().algorithm();
-  parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater(
-      alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes));
-  if (parameterUpdater_) {
-    return;
-  }
-
-  if (!intconfig_->local) {
-    if (testing && config_->getOptConfig().use_sparse_remote_updater()) {
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      localUpdater.reset(
-          new SgdLocalUpdater(config_->getOptConfig()));  // do nothing
-      parameterUpdater_.reset(
-          new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(),
-                                                    intconfig_->num_passes,
-                                                    testing,
-                                                    std::move(localUpdater)));
-    } else {
-      if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode &&
-          !intconfig_->use_old_updater) {
-        intconfig_->use_old_updater = true;
-        LOG(INFO) << "Sgd sparse training can not work with"
-                  << " ConcurrentRemoteParameterUpdater,"
-                  << " automatically reset --use_old_updater=true";
-      }
-
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      if (config_->getOptConfig().num_batches_per_send_parameter() > 1) {
-        CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD)
-            << "Unsupported algorithm in remote-local mode: " << alg;
-        if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-          localUpdater.reset(new SgdThreadUpdater(*config_));
-        } else {
-          localUpdater.reset(new SgdLocalUpdater(*config_));
-        }
-      }
-
-      localUpdater.reset(
-          intconfig_->use_old_updater
-              ? new RemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater))
-              : new ConcurrentRemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater)));
-
-      if (config_->getOptConfig().use_sparse_remote_updater()) {
-        localUpdater.reset(
-            new SparseRemoteParameterUpdaterComposite(*config_,
-                                                      intconfig_->num_passes,
-                                                      testing,
-                                                      std::move(localUpdater)));
-      }
-
-      this->parameterUpdater_ = std::move(localUpdater);
-    }
-  } else {
-    CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1)
-        << "num_batches_per_send_parameter should be one in local mode!";
-
-    if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-      parameterUpdater_.reset(new SgdThreadUpdater(*config_));
-    } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) {
-      if (config_->getModelConfig().type() == "recursive_nn") {
-        parameterUpdater_.reset(new SgdCpuUpdater(*config_));
-      } else if (intconfig_->use_gpu &&
-                 config_->getOptConfig().do_average_in_cpu() &&
-                 config_->getOptConfig().average_window() > 0) {
-        parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_));
-      } else {
-        parameterUpdater_.reset(new SgdLocalUpdater(*config_));
-      }
-    } else {
-      LOG(FATAL) << "Unsupported algorithm in local mode: " << alg;
-    }
-  }
-}
-
-void TrainerInternal::forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>& outArgs,
-                                           PassType& passType,
-                                           UpdateCallback updateCallback,
-                                           bool doPipelineUpdate) {
-  gradientMachine_->forwardBackward(
-      inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.h b/paddle/legacy/trainer/TrainerInternal.h
deleted file mode 100644
index 93919a68f..000000000
--- a/paddle/legacy/trainer/TrainerInternal.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fstream>
-
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternalConfig.h"
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-namespace paddle {
-
-/**
- * TrainerInteral
- * the core training class for driving training logic
- */
-class TrainerInternal {
- public:
-  struct ParaStat {
-    real maxAbsGrad;
-    real avgAbsGrad;
-    ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {}
-  };
-
-  TrainerInternal() {}
-
-  /**
-   * Intializes trainer internal class
-   * @param config network config
-   * @param machine gradient machine
-   * @param intconfig training config
-   * @param stats training stats
-   * @param testing if it is in testing phase
-   */
-  void init(const std::shared_ptr<TrainerConfigHelper>& config,
-            const GradientMachinePtr& machine,
-            std::unique_ptr<TrainerInternalConfig>&& intconfig,
-            const std::shared_ptr<TrainerStats>& stats,
-            bool testing);
-
-  virtual ~TrainerInternal() {}
-
-  /**
-   * CreateParameterUpdater
-   * @param testing if it is in testing phase
-   */
-  void createParameterUpdater(bool testing);
-
-  /**
-   * FinishTrainPass
-   * @param passId current pass id
-   * @param batchId current batch id, starts from 0
-   */
-  void finishTrainPass(int passId, int batchId);
-
-  /**
-   * trainOneBatch
-   * @param batchId current batch id
-   * @param dataBatch data for the batch
-   */
-  void trainOneBatch(int64_t batchId,
-                     const DataBatch& dataBatch,
-                     std::vector<Argument>* outArgs);
-
-  /**
-   * showParameterStats
-   * @param paraStats training stats
-   */
-  void showParameterStats(const std::vector<ParaStat>& paraStats);
-
-  /**
-   * getGradientMachine
-   */
-  inline const GradientMachinePtr& getGradientMachine() const {
-    return gradientMachine_;
-  }
-
-  /**
-   * getParameterUpdater
-   */
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdater() {
-    return parameterUpdater_;
-  }
-
-  /**
-   * setCurrentEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; }
-
-  /**
-   * setEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; }
-
-  /**
-   * forwardBackwardBatch
-   * @param inArgs input argument for data batch
-   * @param outArgs output argument from neural network
-   * @param updateCallback layerwise parameter gradient statistics
-   * @param doPipelineUpdate whether to do pipeline update
-   */
-  virtual void forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>& outArgs,
-                                    PassType& passType,
-                                    UpdateCallback updateCallback,
-                                    bool doPipelineUpdate);
-
- protected:
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TrainerInternalConfig> intconfig_;
-  std::shared_ptr<TrainerStats> stats_;
-  Evaluator* currentEvaluator_;
-  Evaluator* evaluator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternalConfig.cpp b/paddle/legacy/trainer/TrainerInternalConfig.cpp
deleted file mode 100644
index 039fcdb52..000000000
--- a/paddle/legacy/trainer/TrainerInternalConfig.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerInternalConfig.h"
-
-DEFINE_int32(show_parameter_stats_period,
-             0,
-             "Whether to show parameter stats during training");
-
-DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
-
-DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
-
-DECLARE_int32(num_passes);
-
-DECLARE_bool(local);
-
-namespace paddle {
-
-std::unique_ptr<TrainerInternalConfig> TrainerInternalConfig::createFromMode(
-    GradientMachine::CreateMode mode) {
-  auto config = new TrainerInternalConfig();
-  config->mode = mode;
-  config->local = FLAGS_local;
-  config->use_gpu = FLAGS_use_gpu;
-  config->trainer_count = FLAGS_trainer_count;
-  config->show_param_stats_period = FLAGS_show_parameter_stats_period;
-  config->trainer_id = FLAGS_trainer_id;
-  config->log_period = FLAGS_log_period;
-  config->dot_period = FLAGS_dot_period;
-  config->num_passes = FLAGS_num_passes;
-  config->use_old_updater = FLAGS_use_old_updater;
-  config->loadsave_parameters_in_pserver = FLAGS_loadsave_parameters_in_pserver;
-
-  return std::unique_ptr<TrainerInternalConfig>(config);
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternalConfig.h b/paddle/legacy/trainer/TrainerInternalConfig.h
deleted file mode 100644
index b91b53932..000000000
--- a/paddle/legacy/trainer/TrainerInternalConfig.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/legacy/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include <sstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-/**
- * @brief TrainerStats object will statistics sample processed and total cost.
- *
- * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost'
- * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost
- * through one mini-batch.
- */
-class TrainerStats {
- public:
-  /**
-   * @brief reset all stats.
-   *
-   * often used before pass start.
-   */
-  inline void reset() {
-    numProcessed_ = 0;
-    totalCost_ = .0;
-    this->resetCurrentStat();
-  }
-
-  /**
-   * @brief reset current stat.
-   *
-   * 'current' means the most recent --log_period mini-batches
-   */
-  inline void resetCurrentStat() {
-    currentCost_ = .0;
-    currentSamples_ = 0;
-  }
-
-  /**
-   * @brief add cost to stat.
-   * @param numProcessed current mini-batch size
-   * @param cost current mini-batch cost
-   */
-  inline void addCost(int64_t numProcessed, real cost) {
-    this->numProcessed_ += numProcessed;
-    this->totalCost_ += cost;
-    this->currentSamples_ += numProcessed;
-    this->currentCost_ += cost;
-  }
-
-  /**
-   * @brief get average cost through on pass(all processed mini-batches)
-   * @return pass average cost
-   */
-  inline real getAvgCost() const {
-    CHECK_NE(this->numProcessed_, 0);
-    return this->totalCost_ / this->numProcessed_;
-  }
-
-  /**
-   * @brief get current mini-batch's average cost.
-   * @return mini-batch average cost
-   */
-  inline real getCurrentAvgCost() const {
-    CHECK_NE(this->currentSamples_, 0);
-    return this->currentCost_ / this->currentSamples_;
-  }
-
-  /**
-   * @brief get all processed samples' number
-   * @return all processed samples' number
-   */
-  inline int64_t getNumProcessed() const { return this->numProcessed_; }
-
-  /**
-   * @brief same function as addCost. But it is simple to invoke.
-   * For example:
-   *
-   * @code{.cpp}
-   * TrainerStats stat;
-   * cost = neuralNetwork.forward(batchSize);
-   * stat += {batchSize, cost};
-   * @endcode
-   *
-   * @param p a pair of parameter, first is numProcessed, second is cost.
-   * @return *this
-   */
-  inline TrainerStats& operator+=(const std::pair<int64_t, real>& p) {
-    this->addCost(p.first, p.second);
-    return *this;
-  }
-
-  /**
-   * @brief TrainerStats Constructor.
-   *
-   * reset stat when constructed.
-   */
-  inline TrainerStats() { this->reset(); }
-
-  /**
-   * @brief show stats to ostream.
-   *
-   * If there is no need to print current cost, set withCurrentCost to False.
-   *
-   * @param os output stream.
-   * @param withCurrentCost print current cost or not.
-   */
-  void showStats(std::ostream& os, bool withCurrentCost = true) const {
-    os << "samples=" << this->getNumProcessed()
-       << " AvgCost=" << this->getAvgCost();
-    if (withCurrentCost) {
-      os << " CurrentCost=" << this->getCurrentAvgCost();
-    }
-  }
-
-  /**
-   * @brief get stats to std::string
-   * @param withCurrentCost return current cost or not
-   * @return stats string
-   */
-  std::string getStats(bool withCurrentCost = true) const {
-    std::ostringstream os;
-    this->showStats(os, withCurrentCost);
-    return os.str();
-  }
-
- private:
-  int64_t numProcessed_;
-  real totalCost_;
-  real currentCost_;
-  int64_t currentSamples_;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) {
-  stats.showStats(os);
-  return os;
-}
-
-/**
- * TrainerInternalConfig
- * general configs for training
- */
-struct TrainerInternalConfig {
-  /**
-   * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and
-   * command line arguments.
-   * @param mode
-   * @return
-   */
-  static std::unique_ptr<TrainerInternalConfig> createFromMode(
-      GradientMachine::CreateMode mode);
-
-  /**
-   * indicate whether the training is local
-   * if local, no parameter server is used
-   */
-  bool local;
-
-  /**
-   * indicate whether training uses GPU
-   */
-  bool use_gpu;
-
-  /**
-   * indicate number of trainer
-   */
-  int trainer_count;
-
-  /**
-   * how frequently to show param stats
-   */
-  int show_param_stats_period;
-
-  /**
-   * current trainer id
-   */
-  int trainer_id;
-
-  /**
-   * frequency to dump log
-   */
-  int log_period;
-
-  /**
-   * dot period
-   */
-  int dot_period;
-
-  /**
-   * num passes for training
-   */
-  int num_passes;
-
-  /**
-   * use old updater
-   */
-  bool use_old_updater;
-
-  /**
-   * whether to load and save parameter in pserver
-   */
-  bool loadsave_parameters_in_pserver;
-
-  /**
-   * training mode
-   */
-  GradientMachine::CreateMode mode;
-};
-
-}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TrainerMain.cpp b/paddle/legacy/trainer/TrainerMain.cpp
deleted file mode 100644
index 911aeba19..000000000
--- a/paddle/legacy/trainer/TrainerMain.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include "paddle/legacy/pserver/ParameterServerController.h"
-#include "paddle/legacy/utils/PythonUtil.h"
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-
-DEFINE_bool(start_pserver, false, "Whether to start pserver");
-DECLARE_int32(gpu_id);
-DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-DECLARE_int32(start_pass);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(rdma_tcp);
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  // write logs instantly (never buffer log messages)
-  FLAGS_logbuflevel = -1;
-
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
-  if (FLAGS_start_pserver) {
-    parameterServerPtr.reset(
-        paddle::ParameterServerController::createFromGflags());
-    parameterServerPtr->start();
-  }
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlags();
-  CHECK(config != nullptr) << "no valid config";
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  trainer.init(config, FLAGS_job == "test");
-
-  if (FLAGS_job == "train") {
-    trainer.train();
-  } else if (FLAGS_job == "checkgrad") {
-    trainer.checkGradient();
-  } else if (FLAGS_job == "test") {
-    trainer.test();
-  } else if (FLAGS_job == "time") {
-    trainer.time();
-  } else {
-    LOG(FATAL) << "Unknown job type: " << FLAGS_job;
-  }
-
-  return 0;
-}
diff --git a/paddle/legacy/trainer/tests/.gitignore b/paddle/legacy/trainer/tests/.gitignore
deleted file mode 100644
index aedb0ef22..000000000
--- a/paddle/legacy/trainer/tests/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-dump_text.test
-test_pydata_provider_wrapper.json
-*proto.bin
diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt
deleted file mode 100644
index fbefcced5..000000000
--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
-    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
-)
-add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
-
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/trainer/tests)
-function(trainer_test TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endfunction()
-
-trainer_test(test_Compare)
-trainer_test(test_PyDataProviderWrapper)
-trainer_test(test_recurrent_machine_generation)
-if(NOT APPLE)
-  trainer_test(test_Trainer)
-else()
-  message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") 
-endif()
-
-############### test_TrainerOnePass ##########################
-if(WITH_PYTHON)
-  # only run test_TrainerOnePass when PYTHON is enabled, because train one pass
-  # is using PyDataProvider2.
-  add_unittest_without_exec(test_TrainerOnePass
-      test_TrainerOnePass.cpp)
-  add_test(NAME test_TrainerOnePass
-    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
-          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
-endif()
-
-#################### test_config_parser #########################
-add_test(NAME test_config_parser
-  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
-        ${PADDLE_SOURCE_DIR}/paddle/legacy/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
diff --git a/paddle/legacy/trainer/tests/__init__.py b/paddle/legacy/trainer/tests/__init__.py
deleted file mode 100644
index f662d6826..000000000
--- a/paddle/legacy/trainer/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/paddle/legacy/trainer/tests/config_parser_test.py b/paddle/legacy/trainer/tests/config_parser_test.py
deleted file mode 100644
index 0d3d82cbd..000000000
--- a/paddle/legacy/trainer/tests/config_parser_test.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config_and_serialize
-
-if __name__ == '__main__':
-    parse_config_and_serialize('legacy/trainer/tests/test_config.conf', '')
-    parse_config_and_serialize(
-        'legacy/trainer/tests/sample_trainer_config.conf',
-        'extension_module_name=paddle.trainer.config_parser_extension')
-    parse_config_and_serialize(
-        'legacy/gserver/tests/pyDataProvider/trainer.conf', '')
diff --git a/paddle/legacy/trainer/tests/fake_file_list.list b/paddle/legacy/trainer/tests/fake_file_list.list
deleted file mode 100644
index f27ceed27..000000000
--- a/paddle/legacy/trainer/tests/fake_file_list.list
+++ /dev/null
@@ -1 +0,0 @@
-do_not_matter.txt
diff --git a/paddle/legacy/trainer/tests/picojson.h b/paddle/legacy/trainer/tests/picojson.h
deleted file mode 100644
index 75349537b..000000000
--- a/paddle/legacy/trainer/tests/picojson.h
+++ /dev/null
@@ -1,1103 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * Copyright 2009-2010 Cybozu Labs, Inc.
- * Copyright 2011-2014 Kazuho Oku
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef picojson_h
-#define picojson_h
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-// for isnan/isinf
-#if __cplusplus >= 201103L
-#include <cmath>
-#else
-extern "C" {
-#ifdef _MSC_VER
-#include <float.h>
-#elif defined(__INTEL_COMPILER)
-#include <mathimf.h>
-#else
-#include <math.h>
-#endif
-}
-#endif
-
-// experimental support for int64_t (see README.mkdn for detail)
-#ifdef PICOJSON_USE_INT64
-#define __STDC_FORMAT_MACROS
-#include <errno.h>
-#include <inttypes.h>
-#endif
-
-// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0
-#ifndef PICOJSON_USE_LOCALE
-#define PICOJSON_USE_LOCALE 1
-#endif
-#if PICOJSON_USE_LOCALE
-extern "C" {
-#include <locale.h>
-}
-#endif
-
-#ifndef PICOJSON_ASSERT
-#define PICOJSON_ASSERT(e)                  \
-  do {                                      \
-    if (!(e)) throw std::runtime_error(#e); \
-  } while (0)
-#endif
-
-#ifdef _MSC_VER
-#define SNPRINTF _snprintf_s
-#pragma warning(push)
-#pragma warning(disable : 4244)  // conversion from int to char
-#pragma warning(disable : 4127)  // conditional expression is constant
-#pragma warning(disable : 4702)  // unreachable code
-#else
-#define SNPRINTF snprintf
-#endif
-
-namespace picojson {
-
-enum {
-  null_type,
-  boolean_type,
-  number_type,
-  string_type,
-  array_type,
-  object_type
-#ifdef PICOJSON_USE_INT64
-  ,
-  int64_type
-#endif
-};
-
-enum { INDENT_WIDTH = 2 };
-
-struct null {};
-
-class value {
- public:
-  typedef std::vector<value> array;
-  typedef std::map<std::string, value> object;
-  union _storage {
-    bool boolean_;
-    double number_;
-#ifdef PICOJSON_USE_INT64
-    int64_t int64_;
-#endif
-    std::string* string_;
-    array* array_;
-    object* object_;
-  };
-
- protected:
-  int type_;
-  _storage u_;
-
- public:
-  value();
-  value(int type, bool);
-  explicit value(bool b);
-#ifdef PICOJSON_USE_INT64
-  explicit value(int64_t i);
-#endif
-  explicit value(double n);
-  explicit value(const std::string& s);
-  explicit value(const array& a);
-  explicit value(const object& o);
-  explicit value(const char* s);
-  value(const char* s, size_t len);
-  ~value();
-  value(const value& x);
-  value& operator=(const value& x);
-  void swap(value& x);
-  template <typename T>
-  bool is() const;
-  template <typename T>
-  const T& get() const;
-  template <typename T>
-  T& get();
-  bool evaluate_as_boolean() const;
-  const value& get(size_t idx) const;
-  const value& get(const std::string& key) const;
-  value& get(size_t idx);
-  value& get(const std::string& key);
-
-  bool contains(size_t idx) const;
-  bool contains(const std::string& key) const;
-  std::string to_str() const;
-  template <typename Iter>
-  void serialize(Iter os, bool prettify = false) const;
-  std::string serialize(bool prettify = false) const;
-
- private:
-  template <typename T>
-  value(const T*);  // intentionally defined to block implicit conversion of
-                    // pointer to bool
-  template <typename Iter>
-  static void _indent(Iter os, int indent);
-  template <typename Iter>
-  void _serialize(Iter os, int indent) const;
-  std::string _serialize(int indent) const;
-};
-
-typedef value::array array;
-typedef value::object object;
-
-inline value::value() : type_(null_type) {}
-
-inline value::value(int type, bool) : type_(type) {
-  switch (type) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(boolean_, false);
-    INIT(number_, 0.0);
-#ifdef PICOJSON_USE_INT64
-    INIT(int64_, 0);
-#endif
-    INIT(string_, new std::string());
-    INIT(array_, new array());
-    INIT(object_, new object());
-#undef INIT
-    default:
-      break;
-  }
-}
-
-inline value::value(bool b) : type_(boolean_type) { u_.boolean_ = b; }
-
-#ifdef PICOJSON_USE_INT64
-inline value::value(int64_t i) : type_(int64_type) { u_.int64_ = i; }
-#endif
-
-inline value::value(double n) : type_(number_type) {
-  if (
-#ifdef _MSC_VER
-      !_finite(n)
-#elif __cplusplus >= 201103L || !(defined(isnan) && defined(isinf))
-      std::isnan(n) || std::isinf(n)
-#else
-      isnan(n) || isinf(n)
-#endif
-          ) {
-    throw std::overflow_error("");
-  }
-  u_.number_ = n;
-}
-
-inline value::value(const std::string& s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const array& a) : type_(array_type) {
-  u_.array_ = new array(a);
-}
-
-inline value::value(const object& o) : type_(object_type) {
-  u_.object_ = new object(o);
-}
-
-inline value::value(const char* s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const char* s, size_t len) : type_(string_type) {
-  u_.string_ = new std::string(s, len);
-}
-
-inline value::~value() {
-  switch (type_) {
-#define DEINIT(p) \
-  case p##type:   \
-    delete u_.p;  \
-    break
-    DEINIT(string_);
-    DEINIT(array_);
-    DEINIT(object_);
-#undef DEINIT
-    default:
-      break;
-  }
-}
-
-inline value::value(const value& x) : type_(x.type_) {
-  switch (type_) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(string_, new std::string(*x.u_.string_));
-    INIT(array_, new array(*x.u_.array_));
-    INIT(object_, new object(*x.u_.object_));
-#undef INIT
-    default:
-      u_ = x.u_;
-      break;
-  }
-}
-
-inline value& value::operator=(const value& x) {
-  if (this != &x) {
-    value t(x);
-    swap(t);
-  }
-  return *this;
-}
-
-inline void value::swap(value& x) {
-  std::swap(type_, x.type_);
-  std::swap(u_, x.u_);
-}
-
-#define IS(ctype, jtype)                 \
-  template <>                            \
-  inline bool value::is<ctype>() const { \
-    return type_ == jtype##_type;        \
-  }
-IS(null, null)
-IS(bool, boolean)
-#ifdef PICOJSON_USE_INT64
-IS(int64_t, int64)
-#endif
-IS(std::string, string)
-IS(array, array)
-IS(object, object)
-#undef IS
-template <>
-inline bool value::is<double>() const {
-  return type_ == number_type
-#ifdef PICOJSON_USE_INT64
-         || type_ == int64_type
-#endif
-      ;
-}
-
-#define GET(ctype, var)                                                    \
-  template <>                                                              \
-  inline const ctype& value::get<ctype>() const {                          \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }                                                                        \
-  template <>                                                              \
-  inline ctype& value::get<ctype>() {                                      \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }
-GET(bool, u_.boolean_)
-GET(std::string, *u_.string_)
-GET(array, *u_.array_)
-GET(object, *u_.object_)
-#ifdef PICOJSON_USE_INT64
-GET(double,
-    (type_ == int64_type && (const_cast<value*>(this)->type_ = number_type,
-                             const_cast<value*>(this)->u_.number_ = u_.int64_),
-     u_.number_))
-GET(int64_t, u_.int64_)
-#else
-GET(double, u_.number_)
-#endif
-#undef GET
-
-inline bool value::evaluate_as_boolean() const {
-  switch (type_) {
-    case null_type:
-      return false;
-    case boolean_type:
-      return u_.boolean_;
-    case number_type:
-      return u_.number_ != 0;
-#ifdef PICOJSON_USE_INT64
-    case int64_type:
-      return u_.int64_ != 0;
-#endif
-    case string_type:
-      return !u_.string_->empty();
-    default:
-      return true;
-  }
-}
-
-inline const value& value::get(size_t idx) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline value& value::get(size_t idx) {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline const value& value::get(const std::string& key) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline value& value::get(const std::string& key) {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline bool value::contains(size_t idx) const {
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size();
-}
-
-inline bool value::contains(const std::string& key) const {
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end();
-}
-
-inline std::string value::to_str() const {
-  switch (type_) {
-    case null_type:
-      return "null";
-    case boolean_type:
-      return u_.boolean_ ? "true" : "false";
-#ifdef PICOJSON_USE_INT64
-    case int64_type: {
-      char buf[sizeof("-9223372036854775808")];
-      SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_);
-      return buf;
-    }
-#endif
-    case number_type: {
-      char buf[256];
-      double tmp;
-      SNPRINTF(buf,
-               sizeof(buf),
-               fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0
-                   ? "%.f"
-                   : "%.17g",
-               u_.number_);
-#if PICOJSON_USE_LOCALE
-      char* decimal_point = localeconv()->decimal_point;
-      if (strcmp(decimal_point, ".") != 0) {
-        size_t decimal_point_len = strlen(decimal_point);
-        for (char* p = buf; *p != '\0'; ++p) {
-          if (strncmp(p, decimal_point, decimal_point_len) == 0) {
-            return std::string(buf, p) + "." + (p + decimal_point_len);
-          }
-        }
-      }
-#endif
-      return buf;
-    }
-    case string_type:
-      return *u_.string_;
-    case array_type:
-      return "array";
-    case object_type:
-      return "object";
-    default:
-      PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-      __assume(0);
-#endif
-  }
-  return std::string();
-}
-
-template <typename Iter>
-void copy(const std::string& s, Iter oi) {
-  std::copy(s.begin(), s.end(), oi);
-}
-
-template <typename Iter>
-void serialize_str(const std::string& s, Iter oi) {
-  *oi++ = '"';
-  for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) {
-    switch (*i) {
-#define MAP(val, sym) \
-  case val:           \
-    copy(sym, oi);    \
-    break
-      MAP('"', "\\\"");
-      MAP('\\', "\\\\");
-      MAP('/', "\\/");
-      MAP('\b', "\\b");
-      MAP('\f', "\\f");
-      MAP('\n', "\\n");
-      MAP('\r', "\\r");
-      MAP('\t', "\\t");
-#undef MAP
-      default:
-        if (static_cast<unsigned char>(*i) < 0x20 || *i == 0x7f) {
-          char buf[7];
-          SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff);
-          copy(buf, buf + 6, oi);
-        } else {
-          *oi++ = *i;
-        }
-        break;
-    }
-  }
-  *oi++ = '"';
-}
-
-template <typename Iter>
-void value::serialize(Iter oi, bool prettify) const {
-  return _serialize(oi, prettify ? 0 : -1);
-}
-
-inline std::string value::serialize(bool prettify) const {
-  return _serialize(prettify ? 0 : -1);
-}
-
-template <typename Iter>
-void value::_indent(Iter oi, int indent) {
-  *oi++ = '\n';
-  for (int i = 0; i < indent * INDENT_WIDTH; ++i) {
-    *oi++ = ' ';
-  }
-}
-
-template <typename Iter>
-void value::_serialize(Iter oi, int indent) const {
-  switch (type_) {
-    case string_type:
-      serialize_str(*u_.string_, oi);
-      break;
-    case array_type: {
-      *oi++ = '[';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end();
-           ++i) {
-        if (i != u_.array_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        i->_serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.array_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = ']';
-      break;
-    }
-    case object_type: {
-      *oi++ = '{';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (object::const_iterator i = u_.object_->begin();
-           i != u_.object_->end();
-           ++i) {
-        if (i != u_.object_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        serialize_str(i->first, oi);
-        *oi++ = ':';
-        if (indent != -1) {
-          *oi++ = ' ';
-        }
-        i->second._serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.object_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = '}';
-      break;
-    }
-    default:
-      copy(to_str(), oi);
-      break;
-  }
-  if (indent == 0) {
-    *oi++ = '\n';
-  }
-}
-
-inline std::string value::_serialize(int indent) const {
-  std::string s;
-  _serialize(std::back_inserter(s), indent);
-  return s;
-}
-
-template <typename Iter>
-class input {
- protected:
-  Iter cur_, end_;
-  int last_ch_;
-  bool ungot_;
-  int line_;
-
- public:
-  input(const Iter& first, const Iter& last)
-      : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
-  int getc() {
-    if (ungot_) {
-      ungot_ = false;
-      return last_ch_;
-    }
-    if (cur_ == end_) {
-      last_ch_ = -1;
-      return -1;
-    }
-    if (last_ch_ == '\n') {
-      line_++;
-    }
-    last_ch_ = *cur_ & 0xff;
-    ++cur_;
-    return last_ch_;
-  }
-  void ungetc() {
-    if (last_ch_ != -1) {
-      PICOJSON_ASSERT(!ungot_);
-      ungot_ = true;
-    }
-  }
-  Iter cur() const { return cur_; }
-  int line() const { return line_; }
-  void skip_ws() {
-    while (1) {
-      int ch = getc();
-      if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
-        ungetc();
-        break;
-      }
-    }
-  }
-  bool expect(int expect) {
-    skip_ws();
-    if (getc() != expect) {
-      ungetc();
-      return false;
-    }
-    return true;
-  }
-  bool match(const std::string& pattern) {
-    for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end();
-         ++pi) {
-      if (getc() != *pi) {
-        ungetc();
-        return false;
-      }
-    }
-    return true;
-  }
-};
-
-template <typename Iter>
-inline int _parse_quadhex(input<Iter>& in) {
-  int uni_ch = 0, hex;
-  for (int i = 0; i < 4; i++) {
-    if ((hex = in.getc()) == -1) {
-      return -1;
-    }
-    if ('0' <= hex && hex <= '9') {
-      hex -= '0';
-    } else if ('A' <= hex && hex <= 'F') {
-      hex -= 'A' - 0xa;
-    } else if ('a' <= hex && hex <= 'f') {
-      hex -= 'a' - 0xa;
-    } else {
-      in.ungetc();
-      return -1;
-    }
-    uni_ch = uni_ch * 16 + hex;
-  }
-  return uni_ch;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_codepoint(String& out, input<Iter>& in) {
-  int uni_ch;
-  if ((uni_ch = _parse_quadhex(in)) == -1) {
-    return false;
-  }
-  if (0xd800 <= uni_ch && uni_ch <= 0xdfff) {
-    if (0xdc00 <= uni_ch) {
-      // a second 16-bit of a surrogate pair appeared
-      return false;
-    }
-    // first 16-bit of surrogate pair, get the next one
-    if (in.getc() != '\\' || in.getc() != 'u') {
-      in.ungetc();
-      return false;
-    }
-    int second = _parse_quadhex(in);
-    if (!(0xdc00 <= second && second <= 0xdfff)) {
-      return false;
-    }
-    uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff);
-    uni_ch += 0x10000;
-  }
-  if (uni_ch < 0x80) {
-    out.push_back(uni_ch);
-  } else {
-    if (uni_ch < 0x800) {
-      out.push_back(0xc0 | (uni_ch >> 6));
-    } else {
-      if (uni_ch < 0x10000) {
-        out.push_back(0xe0 | (uni_ch >> 12));
-      } else {
-        out.push_back(0xf0 | (uni_ch >> 18));
-        out.push_back(0x80 | ((uni_ch >> 12) & 0x3f));
-      }
-      out.push_back(0x80 | ((uni_ch >> 6) & 0x3f));
-    }
-    out.push_back(0x80 | (uni_ch & 0x3f));
-  }
-  return true;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_string(String& out, input<Iter>& in) {
-  while (1) {
-    int ch = in.getc();
-    if (ch < ' ') {
-      in.ungetc();
-      return false;
-    } else if (ch == '"') {
-      return true;
-    } else if (ch == '\\') {
-      if ((ch = in.getc()) == -1) {
-        return false;
-      }
-      switch (ch) {
-#define MAP(sym, val)   \
-  case sym:             \
-    out.push_back(val); \
-    break
-        MAP('"', '\"');
-        MAP('\\', '\\');
-        MAP('/', '/');
-        MAP('b', '\b');
-        MAP('f', '\f');
-        MAP('n', '\n');
-        MAP('r', '\r');
-        MAP('t', '\t');
-#undef MAP
-        case 'u':
-          if (!_parse_codepoint(out, in)) {
-            return false;
-          }
-          break;
-        default:
-          return false;
-      }
-    } else {
-      out.push_back(ch);
-    }
-  }
-  return false;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_array(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_array_start()) {
-    return false;
-  }
-  size_t idx = 0;
-  if (in.expect(']')) {
-    return ctx.parse_array_stop(idx);
-  }
-  do {
-    if (!ctx.parse_array_item(in, idx)) {
-      return false;
-    }
-    idx++;
-  } while (in.expect(','));
-  return in.expect(']') && ctx.parse_array_stop(idx);
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_object(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_object_start()) {
-    return false;
-  }
-  if (in.expect('}')) {
-    return true;
-  }
-  do {
-    std::string key;
-    if (!in.expect('"') || !_parse_string(key, in) || !in.expect(':')) {
-      return false;
-    }
-    if (!ctx.parse_object_item(in, key)) {
-      return false;
-    }
-  } while (in.expect(','));
-  return in.expect('}');
-}
-
-template <typename Iter>
-inline std::string _parse_number(input<Iter>& in) {
-  std::string num_str;
-  while (1) {
-    int ch = in.getc();
-    if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' ||
-        ch == 'E') {
-      num_str.push_back(ch);
-    } else if (ch == '.') {
-#if PICOJSON_USE_LOCALE
-      num_str += localeconv()->decimal_point;
-#else
-      num_str.push_back('.');
-#endif
-    } else {
-      in.ungetc();
-      break;
-    }
-  }
-  return num_str;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse(Context& ctx, input<Iter>& in) {
-  in.skip_ws();
-  int ch = in.getc();
-  switch (ch) {
-#define IS(ch, text, op)        \
-  case ch:                      \
-    if (in.match(text) && op) { \
-      return true;              \
-    } else {                    \
-      return false;             \
-    }
-    IS('n', "ull", ctx.set_null());
-    IS('f', "alse", ctx.set_bool(false));
-    IS('t', "rue", ctx.set_bool(true));
-#undef IS
-    case '"':
-      return ctx.parse_string(in);
-    case '[':
-      return _parse_array(ctx, in);
-    case '{':
-      return _parse_object(ctx, in);
-    default:
-      if (('0' <= ch && ch <= '9') || ch == '-') {
-        double f;
-        char* endp;
-        in.ungetc();
-        std::string num_str = _parse_number(in);
-        if (num_str.empty()) {
-          return false;
-        }
-#ifdef PICOJSON_USE_INT64
-        {
-          errno = 0;
-          intmax_t ival = strtoimax(num_str.c_str(), &endp, 10);
-          if (errno == 0 && std::numeric_limits<int64_t>::min() <= ival &&
-              ival <= std::numeric_limits<int64_t>::max() &&
-              endp == num_str.c_str() + num_str.size()) {
-            ctx.set_int64(ival);
-            return true;
-          }
-        }
-#endif
-        f = strtod(num_str.c_str(), &endp);
-        if (endp == num_str.c_str() + num_str.size()) {
-          ctx.set_number(f);
-          return true;
-        }
-        return false;
-      }
-      break;
-  }
-  in.ungetc();
-  return false;
-}
-
-class deny_parse_context {
- public:
-  bool set_null() { return false; }
-  bool set_bool(bool) { return false; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return false; }
-#endif
-  bool set_number(double) { return false; }
-  template <typename Iter>
-  bool parse_string(input<Iter>&) {
-    return false;
-  }
-  bool parse_array_start() { return false; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>&, size_t) {
-    return false;
-  }
-  bool parse_array_stop(size_t) { return false; }
-  bool parse_object_start() { return false; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>&, const std::string&) {
-    return false;
-  }
-};
-
-class default_parse_context {
- protected:
-  value* out_;
-
- public:
-  default_parse_context(value* out) : out_(out) {}
-  bool set_null() {
-    *out_ = value();
-    return true;
-  }
-  bool set_bool(bool b) {
-    *out_ = value(b);
-    return true;
-  }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t i) {
-    *out_ = value(i);
-    return true;
-  }
-#endif
-  bool set_number(double f) {
-    *out_ = value(f);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    *out_ = value(string_type, false);
-    return _parse_string(out_->get<std::string>(), in);
-  }
-  bool parse_array_start() {
-    *out_ = value(array_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    array& a = out_->get<array>();
-    a.push_back(value());
-    default_parse_context ctx(&a.back());
-    return _parse(ctx, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() {
-    *out_ = value(object_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string& key) {
-    object& o = out_->get<object>();
-    default_parse_context ctx(&o[key]);
-    return _parse(ctx, in);
-  }
-
- private:
-  default_parse_context(const default_parse_context&);
-  default_parse_context& operator=(const default_parse_context&);
-};
-
-class null_parse_context {
- public:
-  struct dummy_str {
-    void push_back(int) {}
-  };
-
- public:
-  null_parse_context() {}
-  bool set_null() { return true; }
-  bool set_bool(bool) { return true; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return true; }
-#endif
-  bool set_number(double) { return true; }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    dummy_str s;
-    return _parse_string(s, in);
-  }
-  bool parse_array_start() { return true; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    return _parse(*this, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() { return true; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string&) {
-    return _parse(*this, in);
-  }
-
- private:
-  null_parse_context(const null_parse_context&);
-  null_parse_context& operator=(const null_parse_context&);
-};
-
-// obsolete, use the version below
-template <typename Iter>
-inline std::string parse(value& out, Iter& pos, const Iter& last) {
-  std::string err;
-  pos = parse(out, pos, last, &err);
-  return err;
-}
-
-template <typename Context, typename Iter>
-inline Iter _parse(Context& ctx,
-                   const Iter& first,
-                   const Iter& last,
-                   std::string* err) {
-  input<Iter> in(first, last);
-  if (!_parse(ctx, in) && err != NULL) {
-    char buf[64];
-    SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line());
-    *err = buf;
-    while (1) {
-      int ch = in.getc();
-      if (ch == -1 || ch == '\n') {
-        break;
-      } else if (ch >= ' ') {
-        err->push_back(ch);
-      }
-    }
-  }
-  return in.cur();
-}
-
-template <typename Iter>
-inline Iter parse(value& out,
-                  const Iter& first,
-                  const Iter& last,
-                  std::string* err) {
-  default_parse_context ctx(&out);
-  return _parse(ctx, first, last, err);
-}
-
-inline std::string parse(value& out, const std::string& s) {
-  std::string err;
-  parse(out, s.begin(), s.end(), &err);
-  return err;
-}
-
-inline std::string parse(value& out, std::istream& is) {
-  std::string err;
-  parse(out,
-        std::istreambuf_iterator<char>(is.rdbuf()),
-        std::istreambuf_iterator<char>(),
-        &err);
-  return err;
-}
-
-template <typename T>
-struct last_error_t {
-  static std::string s;
-};
-template <typename T>
-std::string last_error_t<T>::s;
-
-inline void set_last_error(const std::string& s) { last_error_t<bool>::s = s; }
-
-inline const std::string& get_last_error() { return last_error_t<bool>::s; }
-
-inline bool operator==(const value& x, const value& y) {
-  if (x.is<null>()) return y.is<null>();
-#define PICOJSON_CMP(type) \
-  if (x.is<type>()) return y.is<type>() && x.get<type>() == y.get<type>()
-  PICOJSON_CMP(bool);
-  PICOJSON_CMP(double);
-  PICOJSON_CMP(std::string);
-  PICOJSON_CMP(array);
-  PICOJSON_CMP(object);
-#undef PICOJSON_CMP
-  PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-  __assume(0);
-#endif
-  return false;
-}
-
-inline bool operator!=(const value& x, const value& y) { return !(x == y); }
-}  // namespace picojson
-
-namespace std {
-template <>
-inline void swap(picojson::value& x, picojson::value& y) {
-  x.swap(y);
-}
-}  // namespace std
-
-inline std::istream& operator>>(std::istream& is, picojson::value& x) {
-  picojson::set_last_error(std::string());
-  std::string err = picojson::parse(x, is);
-  if (!err.empty()) {
-    picojson::set_last_error(err);
-    is.setstate(std::ios::failbit);
-  }
-  return is;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
-  x.serialize(std::ostream_iterator<char>(os));
-  return os;
-}
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#endif
diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
deleted file mode 100644
index ed83e6ae8..000000000
--- a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
+++ /dev/null
@@ -1,2 +0,0 @@
-0;0 1 3 5;1 3.42 2.25;2 4:4.2 6:2.8;3 aa
-2;0 7 3 8;1 2.25 1.24;2 1:2.3 5:8.24;3 bb
diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
deleted file mode 100644
index 11c1b1b38..000000000
--- a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
+++ /dev/null
@@ -1 +0,0 @@
-legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
deleted file mode 100644
index 47401c949..000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
+++ /dev/null
@@ -1,60 +0,0 @@
-0
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-1
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-2
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-3
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-4
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-5
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-6
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-7
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-8
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-9
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-10
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-11
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-12
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-13
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
-14
-0	0	 1 2 3 4
-1	-0.2	 0 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
deleted file mode 100644
index 02c7f142a..000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
+++ /dev/null
@@ -1,16 +0,0 @@
-0	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-	 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
deleted file mode 100644
index 23bf1179e..000000000
--- a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
+++ /dev/null
@@ -1,16 +0,0 @@
-0	 1 2 3 4
-1	 1 2 3 4
-2	 1 2 3 4
-3	 1 2 3 4
-4	 1 2 3 4
-5	 1 2 3 4
-6	 1 2 3 4
-7	 1 2 3 4
-8	 1 2 3 4
-9	 1 2 3 4
-10	 1 2 3 4
-11	 1 2 3 4
-12	 1 2 3 4
-13	 1 2 3 4
-14	 1 2 3 4
-
diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
deleted file mode 100644
index 161624fbf795ac6188795a6350ab0887b53e6bba..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
wcmZQzU|?VYVo4wdfwO0P_CZ)D4lytw;|8qat5>bUDh@PbKg1jmiDJ%v0D;yY&;S4c

diff --git a/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
deleted file mode 100644
index 30ccf33d2e308ae12f1c719986d2a317344cf39b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 116
ZcmZQzU|?VYVo4x|fChUQ3zepxH~_{A1K9uo

diff --git a/paddle/legacy/trainer/tests/sample_data.txt b/paddle/legacy/trainer/tests/sample_data.txt
deleted file mode 100644
index 3398a38bd..000000000
--- a/paddle/legacy/trainer/tests/sample_data.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-0 1 2 -1
-2 3 -1 2
-1 2 2 1
-0 2 1 2
-1 3 1 2
-1 1 2 1
-0 3 -1 2
-1 -2 2 1
-2 2 1 2
-1 3 1 2
diff --git a/paddle/legacy/trainer/tests/sample_filelist.txt b/paddle/legacy/trainer/tests/sample_filelist.txt
deleted file mode 100644
index 8573f9e17..000000000
--- a/paddle/legacy/trainer/tests/sample_filelist.txt
+++ /dev/null
@@ -1 +0,0 @@
-legacy/trainer/tests/sample_data.txt
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config.conf b/paddle/legacy/trainer/tests/sample_trainer_config.conf
deleted file mode 100644
index 5800b3625..000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config.conf
+++ /dev/null
@@ -1,87 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "legacy/trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "legacy/trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=9,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation(),
-               param_attr=ParamAttr(name='sharew'))
-
-fc5 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=BReluActivation())
-
-fc6 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SoftReluActivation())
-
-fc7 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=SquareActivation())
-
-fc8 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SquareActivation())
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as layer9:
-    layer9 += full_matrix_projection(input=fc1)
-    layer9 += full_matrix_projection(input=fc2)
-    layer9 += full_matrix_projection(input=fc3)
-    layer9 += trans_full_matrix_projection(input=fc4,
-                                           param_attr=ParamAttr(name='sharew'))
-    layer9 += full_matrix_projection(input=fc5)
-    layer9 += full_matrix_projection(input=fc6)
-    layer9 += full_matrix_projection(input=fc7)
-    layer9 += full_matrix_projection(input=fc8)
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1)
-    outputs(classification_cost(input=layer9, label=lbl))
-else:    
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(layer9)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
deleted file mode 100644
index 155c40b31..000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
+++ /dev/null
@@ -1,53 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "legacy/trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=12,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=19,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation())
-
-# This is for training the neural network.
-# We need to have another data layer for label
-# and a layer for calculating cost
-lbl = data_layer(name='label', size=1)
-
-outputs(hsigmoid(input=[fc1, fc2, fc3, fc4],
-                 label=lbl,
-                 num_classes=3))
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
deleted file mode 100644
index 49cdde7fa..000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
+++ /dev/null
@@ -1,86 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "legacy/trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "legacy/trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-# Output layer, label layer, cost layer, preferably set to the same environment.
-output_device = 0
-
-# Input Layer does not need to specify the device number.
-data = data_layer(name='input', size=3)
-
-# Calculate in the CPU.
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=-1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc2 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc3 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc4 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc5 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc4,fc5], size=10,
-                  bias_attr=True,
-                  layer_attr=ExtraAttr(device=output_device),
-                  act=SoftmaxActivation())
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1,
-                    layer_attr=ExtraAttr(device=output_device))
-                    
-    outputs(classification_cost(input=output, 
-                                label=lbl,
-                                layer_attr=ExtraAttr(device=output_device)))
-else:
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(output)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
deleted file mode 100644
index 51ef905a5..000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ /dev/null
@@ -1,73 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-def outer_step(dummy_data):
-
-    gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True),
-                  GeneratedInput(size=num_words,
-                                 embedding_name="wordvec",
-                                 embedding_size=num_words)]
-
-    def inner_step(dummy_memory, predict_word):
-
-        # simplified RNN for testing
-        with mixed_layer(size=num_words) as layer:
-            layer += full_matrix_projection(input=predict_word,
-                                            param_attr=ParamAttr(name="transtable"))
-
-        with mixed_layer(size=num_words, act=ExpActivation()) as out:
-            out += trans_full_matrix_projection(input=layer,
-                                                param_attr=ParamAttr(name="wordvec"))
-
-        return out
-
-    beam_gen = beam_search(name="rnn_gen",
-                           step=inner_step,
-                           input=gen_inputs,
-                           bos_id=0,
-                           eos_id=num_words-1,
-                           beam_size=2 if beam_flag else 1,
-                           num_results_per_sample=1,
-                           max_length=10)
-    return beam_gen
-
-beam_gen_concat = recurrent_group(name="rnn_gen_concat",
-                                  step=outer_step,
-                                  input=[SubsequenceInput(dummy_data)])
-
-seqtext_printer_evaluator(input=beam_gen_concat,
-                          id_input=sent_id,
-                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
-                          result_file="./legacy/trainer/tests/dump_text.test")
-#outputs(beam_gen_concat)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
deleted file mode 100644
index 35c7f0fcd..000000000
--- a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
+++ /dev/null
@@ -1,66 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-gen_inputs = [StaticInput(input=dummy_data, size=2),
-              GeneratedInput(size=num_words,
-                             embedding_name="wordvec",
-                             embedding_size=num_words)]
-
-def step(dummy_memory, predict_word):
-
-    # simplified RNN for testing
-    with mixed_layer(size=num_words) as layer:
-        layer += full_matrix_projection(input=predict_word,
-                                        param_attr=ParamAttr(name="transtable"))
-
-    with mixed_layer(size=num_words, act=ExpActivation()) as out:
-        out += trans_full_matrix_projection(input=layer,
-                                            param_attr=ParamAttr(name="wordvec"))
-
-    return out
-
-beam_gen = beam_search(name="rnn_gen",
-                       step=step,
-                       input=gen_inputs,
-                       bos_id=0,
-                       eos_id=num_words-1,
-                       beam_size=2 if beam_flag else 1,
-                       num_results_per_sample=2 if beam_flag else 1,
-                       max_length=10)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=sent_id,
-                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
-                          result_file="./legacy/trainer/tests/dump_text.test")
-#outputs(beam_gen)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
deleted file mode 100644
index 9419f4d90..000000000
--- a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
-
-file_list = 'legacy/trainer/tests/fake_file_list.list'
-
-define_py_data_sources2(
-    train_list=file_list,
-    test_list=file_list,
-    module="simple_sparse_neural_network_dp",
-    obj="process")
-
-embedding = embedding_layer(
-    input=data_layer(
-        name="word_ids", size=8191),
-    size=128,
-    param_attr=ParamAttr(sparse_update=True))
-prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=prediction, label=data_layer(
-            name='label', size=10)))
diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
deleted file mode 100644
index 49043c917..000000000
--- a/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import provider, integer_sequence, integer_value
-import random
-
-
-def init_hook(settings, is_train, **kwargs):
-    settings.is_train = is_train
-
-
-@provider(
-    input_types={'word_ids': integer_value(8191),
-                 'label': integer_value(10)},
-    min_pool_size=0,
-    init_hook=init_hook)
-def process(settings, filename):
-    if settings.is_train:
-        data_size = 2**10
-    else:
-        data_size = 2**5
-
-    for _ in xrange(data_size):
-        yield random.randint(0, 8190), random.randint(0, 9)
diff --git a/paddle/legacy/trainer/tests/testPyDataWrapper.py b/paddle/legacy/trainer/tests/testPyDataWrapper.py
deleted file mode 100644
index a76eeeacb..000000000
--- a/paddle/legacy/trainer/tests/testPyDataWrapper.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-sys.path.append("../")
-
-from paddle.trainer.PyDataProviderWrapper import *
-import random
-import json
-import string
-
-SPARSE_ID_LIMIT = 1000
-SPARSE_ID_COUNT = 100
-SEQUENCE_LIMIT = 50
-STRING_LIMIT = 10
-
-sparse_id_randomer = lambda: random.randrange(0, SPARSE_ID_LIMIT - 1)
-sparse_count_randomer = lambda: random.randrange(1, SPARSE_ID_COUNT)
-val_randomer = lambda: random.uniform(-1.0, 1.0)
-seq_count_randomer = lambda: random.randrange(1, SEQUENCE_LIMIT)
-str_count_randomer = lambda: random.randrange(1, STRING_LIMIT)
-
-
-class IDRandomer():  # A random generator, return unique id
-    def __init__(self):
-        self.id_set = set()
-
-    def __call__(self):
-        idx = sparse_id_randomer()
-        if idx not in self.id_set:
-            self.id_set.add(idx)
-            return idx
-        else:
-            return self.__call__()
-
-
-# SparseValueSlot
-def sparse_value_creator(_):
-    rand = IDRandomer()
-    return [(rand(), val_randomer()) for _ in xrange(sparse_count_randomer())]
-
-
-sparse_value = map(sparse_value_creator, range(seq_count_randomer()))
-
-
-# DenseSlot
-def dense_creator(_):
-    return [val_randomer() for _ in xrange(SPARSE_ID_LIMIT)]
-
-
-dense = map(dense_creator, range(seq_count_randomer()))
-
-
-# SparseNonValueSlot
-def sparse_creator(_):
-    rand = IDRandomer()
-    return [rand() for _ in xrange(sparse_count_randomer())]
-
-
-sparse_nonvalue = map(sparse_creator, range(seq_count_randomer()))
-
-# IndexSlot
-ids = [sparse_id_randomer() for _ in range(seq_count_randomer())]
-
-
-# StringSlot
-def random_str(size=8, chars=string.ascii_letters + string.digits):
-    return ''.join(random.choice(chars) for _ in range(size))
-
-
-strs = [random_str(str_count_randomer()) for _ in range(seq_count_randomer())]
-
-
-def processSeqAndGenerateDataInit(obj, *args, **kwargs):
-    obj.json_filename = kwargs.get("load_data_args", "test_data.json")
-
-
-@provider(
-    slots=[
-        SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
-        SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
-        StringSlot(SPARSE_ID_LIMIT)
-    ],
-    use_seq=True,
-    init_hook=processSeqAndGenerateDataInit)
-def processSeqAndGenerateData(obj, name):
-    retv = [sparse_value, dense, sparse_nonvalue, ids, strs]
-    # Write to protoseq.
-    with open(obj.json_filename, "w") as f:
-        json.dump(retv, f)
-    yield retv
-
-
-def processSubSeqAndGenerateDataInit(obj, *args, **kwargs):
-    obj.json_filename = kwargs.get("load_data_args", "test_data.json")
-
-
-@provider(
-    slots=[
-        SparseValueSlot(SPARSE_ID_LIMIT), DenseSlot(SPARSE_ID_LIMIT),
-        SparseNonValueSlot(SPARSE_ID_LIMIT), IndexSlot(SPARSE_ID_LIMIT),
-        StringSlot(SPARSE_ID_LIMIT)
-    ],
-    use_seq=True,
-    init_hook=processSubSeqAndGenerateDataInit)
-def processSubSeqAndGenerateData(obj, name):
-    retv_json = [sparse_value, dense, sparse_nonvalue, ids, strs]
-    retv_wrapper = [[sparse_value], [dense], [sparse_nonvalue], [ids], [strs]]
-    # Write to protoseq.
-    with open(obj.json_filename, "w") as f:
-        json.dump(retv_json, f)
-    yield retv_wrapper
-
-
-if __name__ == "__main__":
-    pvd = processSeqAndGenerateData("_")
-    print pvd.getNextBatch(100)
-    pvd = processSubSeqAndGenerateData("_")
-    print pvd.getNextBatch(1)
diff --git a/paddle/legacy/trainer/tests/test_Compare.cpp b/paddle/legacy/trainer/tests/test_Compare.cpp
deleted file mode 100644
index e37e546be..000000000
--- a/paddle/legacy/trainer/tests/test_Compare.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <cstdlib>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_string(config_args);
-
-struct comData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(bool useGpu, comData& Data) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-
-  Data.parameters = trainer.getGradientMachine()->getParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start();
-  for (int i = 0; i < 2; ++i) {
-    trainer.getGradientMachine()->forwardBackward(
-        inArgs, &Data.outArgs, PASS_TRAIN);
-  }
-  trainer.getGradientMachine()->finish();
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu);
-
-TEST(Trainer, create) {
-  int devCount = 0;
-  devCount = hl_get_device_count();
-  FLAGS_config_args = "drop_rate=0";
-
-  comData comDataCpu;
-  calcGradient(false, comDataCpu);
-  LOG(INFO) << "Cpu is completed";
-
-  {
-    LOG(INFO) << "Test GPU";
-    comData comData;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu is completed";
-  }
-
-  {
-    LOG(INFO) << "Test test multi gpu";
-    comData comData;
-    FLAGS_trainer_count = devCount;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu4 is completed";
-  }
-
-  {
-    LOG(INFO) << "Test use_sparse_update=true";
-    comData comData;
-    calcGradient(false, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Cpu4 is completed";
-  }
-}
-
-double checkBuffer(real* A, real* B, size_t len) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double precision = 1e-7;
-#else
-  double precision = 2e-3;
-#endif
-  int nNum = 0;
-  double maxE = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double e = fabs(A[i] - B[i]);
-    maxE = std::max(e, maxE);
-    nNum += e > precision * fabs(A[i]);
-  }
-  EXPECT_EQ(0, nNum);
-  return maxE;
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu) {
-  /*compare outArgs*/
-  vector<Argument> outArgs1 = comDataCpu.outArgs;
-  vector<Argument> outArgs2 = comDataGpu.outArgs;
-  CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs1[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt());
-
-  /*compare parameters*/
-  vector<ParameterPtr>& parameters1 = comDataCpu.parameters;
-  vector<ParameterPtr>& parameters2 = comDataGpu.parameters;
-  for (size_t i = 0; i < parameters1.size(); ++i) {
-    ParameterPtr parameter1, parameter2;
-    parameter1 = parameters1[i];
-    parameter2 = parameters2[i];
-    /*compare parameters value*/
-    CpuVector para1(parameter1->getSize());
-    CpuVector para2(parameter2->getSize());
-    para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE));
-    para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE));
-    checkBuffer(para1.getData(), para2.getData(), para1.getSize());
-
-    /*compare parameters grad*/
-    CpuVector cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT));
-    CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT));
-    double e =
-        checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize());
-    LOG(INFO) << parameter1->getName() << " max error=" << e;
-  }
-}
-
-int main(int argc, char** argv) {
-#ifndef PADDLE_WITH_CUDA
-  exit(0);
-#endif
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  exit(ret);
-}
diff --git a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
deleted file mode 100644
index 847adcfab..000000000
--- a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <DataConfig.pb.h>
-#include <gtest/gtest.h>
-#include <paddle/legacy/gserver/dataproviders/DataProvider.h>
-#include <paddle/legacy/math/Matrix.h>
-#include <paddle/legacy/parameter/Argument.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <fstream>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-#include "picojson.h"
-
-void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
-const std::string kDir = "./legacy/trainer/tests/pydata_provider_wrapper_dir/";
-
-TEST(PyDataProviderWrapper, SequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-TEST(PyDataProviderWrapper, HasSubSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSubSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(1, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions and subSequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-      EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].subSequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-void checkValue(std::vector<paddle::Argument>& arguments,
-                picojson::array& arr) {
-  // CHECK SLOT 0, Sparse Value.
-  paddle::Argument& sparse_values_seq = arguments[0];
-  paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value;
-  EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr);
-  paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_values_seq_rawmatrix.get());
-  EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr);
-  EXPECT_EQ(arr.size(), arguments.size());
-  EXPECT_TRUE(arr[0].is<picojson::array>());
-  size_t row_id = 0;
-  for (picojson::value& sparse_val_seq : arr[0].get<picojson::array>()) {
-    std::unordered_map<int, real> cols;
-    for (picojson::value& kv : sparse_val_seq.get<picojson::array>()) {
-      EXPECT_TRUE(kv.get(0).is<double>());
-      EXPECT_TRUE(kv.get(1).is<double>());
-      int col = (int)(kv.get(0).get<double>());
-      real val = (real)(kv.get(1).get<double>());
-      cols.insert({col, val});
-    }
-    size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id);
-    EXPECT_EQ(cols.size(), colNum);
-    int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id);
-    real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id);
-    for (size_t i = 0; i < colNum; ++i) {
-      int id = rowIds[i];
-      auto it = cols.find(id);
-      EXPECT_NE(cols.end(), it);
-      real expect = it->second;
-      EXPECT_NEAR(expect, *rowBuf, 1e-5);
-      ++rowBuf;
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 1, Dense Value.
-  paddle::Argument& dense_arg = arguments[1];
-  paddle::MatrixPtr& dense_mat = dense_arg.value;
-  EXPECT_NE(nullptr, dense_mat);
-  EXPECT_TRUE(arr[1].is<picojson::array>());
-  row_id = 0;
-  for (picojson::value& dense_seq : arr[1].get<picojson::array>()) {
-    EXPECT_TRUE(dense_seq.is<picojson::array>());
-    picojson::array& row = dense_seq.get<picojson::array>();
-    EXPECT_EQ(row.size(), dense_mat->getWidth());
-    real* rowBuf = dense_mat->getRowBuf(row_id++);
-
-    for (picojson::value& val : row) {
-      EXPECT_TRUE(val.is<double>());
-      real expect = val.get<double>();
-      EXPECT_NEAR(expect, *rowBuf++, 1e-5);
-    }
-  }
-
-  // CHECK SLOT 2, Sparse Non Value.
-  paddle::Argument& sparse_non_val_arg = arguments[2];
-  paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value;
-  EXPECT_NE(nullptr, sparse_non_val_rawm);
-  paddle::CpuSparseMatrix* sparse_non_val_m =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_non_val_rawm.get());
-  EXPECT_NE(nullptr, sparse_non_val_m);
-  row_id = 0;
-  for (picojson::value& row : arr[2].get<picojson::array>()) {
-    EXPECT_TRUE(row.is<picojson::array>());
-    std::unordered_set<int> ids;
-    for (picojson::value& id : row.get<picojson::array>()) {
-      EXPECT_TRUE(id.is<double>());
-      ids.insert((int)(id.get<double>()));
-    }
-    size_t colNum = sparse_non_val_m->getColNum(row_id);
-    EXPECT_EQ(ids.size(), colNum);
-    for (size_t i = 0; i < colNum; ++i) {
-      int col = sparse_non_val_m->getRowCols(row_id)[i];
-      EXPECT_TRUE(ids.find(col) != ids.end());
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 3, Index.
-  paddle::Argument& index_arg = arguments[3];
-  paddle::IVectorPtr indices = index_arg.ids;
-  EXPECT_NE(nullptr, indices);
-  int* idPtr = indices->getData();
-  for (picojson::value& id : arr[3].get<picojson::array>()) {
-    EXPECT_TRUE(id.is<double>());
-    int _id = (int)(id.get<double>());
-    EXPECT_EQ(_id, *idPtr++);
-  }
-
-  // CHECK SLOT 4, String.
-  paddle::Argument& strArg = arguments[4];
-  std::vector<std::string>* strPtr = strArg.strs.get();
-  EXPECT_NE(nullptr, strPtr);
-  size_t vecIndex = 0;
-  for (picojson::value& str : arr[4].get<picojson::array>()) {
-    EXPECT_TRUE(str.is<std::string>());
-    std::string _str = str.get<std::string>();
-    EXPECT_EQ(_str, (*strPtr)[vecIndex++]);
-  }
-}
-
-#else
-int main() { return 0; }
-
-#endif
diff --git a/paddle/legacy/trainer/tests/test_Trainer.cpp b/paddle/legacy/trainer/tests/test_Trainer.cpp
deleted file mode 100644
index 14ad0a265..000000000
--- a/paddle/legacy/trainer/tests/test_Trainer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/PythonUtil.h>
-#include <paddle/legacy/utils/Version.h>
-#include "paddle/legacy/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "legacy/trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile4 =
-    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_bool(allow_only_one_model_on_one_gpu);
-
-void checkGradientTest(const string& configFile,
-                       bool useGpu,
-                       bool parallel,
-                       int trainerCount = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  EXPECT_LE(fabs(trainer.checkGradient()), 0.02);
-}
-
-TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
-
-TEST(checkGradient, multiGpu) {
-  int numGpu;
-  numGpu = hl_get_device_count();
-  for (auto count : {2, 4}) {
-    if (count <= numGpu) {
-      checkGradientTest(configFile1, true, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, parallel) {
-  if (hl_get_device_count() >= 2) {
-    checkGradientTest(configFile4, true, true);
-  }
-}
-
-TEST(checkGradient, multiParallel) {
-  FLAGS_allow_only_one_model_on_one_gpu = false;
-  checkGradientTest(configFile4, true, true, 2);
-  FLAGS_allow_only_one_model_on_one_gpu = true;
-}
-
-#endif
-
-TEST(checkGradient, multi) {
-  int numGpu;
-  if (version::isWithGpu()) {
-    numGpu = hl_get_device_count();
-  } else {
-    numGpu = 0;
-  }
-  for (bool useGpu : {false, true}) {
-    for (auto count : {2, 4}) {
-      if (useGpu && count > numGpu) continue;
-      checkGradientTest(configFile1, useGpu, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
-
-TEST(checkGradient, non_parallel) {
-  checkGradientTest(configFile4, false, false);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
deleted file mode 100644
index 3e5c5ea72..000000000
--- a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/legacy/utils/GlobalConstants.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-#include "paddle/legacy/trainer/Trainer.h"
-#include "paddle/legacy/trainer/TrainerInternal.h"
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 =
-    "legacy/trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
-
-static const string& configFileSimpleSparse =
-    "legacy/trainer/tests/simple_sparse_neural_network.py";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-class TrainerForTest : public paddle::Trainer {
- public:
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
-    return this->trainerInternal_.getParameterUpdater();
-  }
-};
-
-int gNumDevices = 0;
-
-void trainerOnePassTest(const string& configFile,
-                        bool useGpu,
-                        bool parallel,
-                        int trainerCount = 1,
-                        double averageWindow = 0.0f,
-                        bool doAverageInCpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  if (averageWindow > 0) {
-    config->getOptConfig().set_average_window(averageWindow);
-    config->getOptConfig().set_do_average_in_cpu(doAverageInCpu);
-  }
-  trainer.init(config);
-  trainer.train();
-}
-
-// 1. test trainer (cpu, gpu).
-TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
-
-TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
-
-TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }
-
-TEST(trainerOnePass, parallel) {
-  if (hl_get_device_count() >= 2) {
-    trainerOnePassTest(configFile2, true, true);
-  }
-}
-#endif
-
-// 2. test average_window.
-#ifdef PADDLE_WITH_CUDA
-TEST(average_window, gpu) {
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-}
-
-TEST(average_window, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-#endif
-
-// 3. test trainer + pserver.
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-
-double checkRemoteParameterUpdater(TrainerForTest& trainer) {
-  auto gradientMachine = trainer.getGradientMachine();
-  auto parameterUpdater = trainer.getParameterUpdaterForTest();
-  auto dataProvider = trainer.getDataProvider();
-  auto& parameters = gradientMachine->getParameters();
-  const TrainerConfig& config = trainer.getConfig();
-  const string& alg = config.opt_config().algorithm();
-
-  vector<ParameterPtr> parameterCheck;
-  for (auto& parameter : parameters) {
-    parameterCheck.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT));
-  }
-
-  std::unique_ptr<ParameterUpdater> parameterUpdaterCheck;
-  if (alg == TrainAlgorithm::SGD) {
-    parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config()));
-  } else {
-    LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg;
-    return -1.0;
-  }
-  parameterUpdaterCheck->init(parameterCheck);
-
-  // gradientMachine->start(config, *dataProvider);
-  DataBatch dataBatch;
-  int32_t batchSize = config.opt_config().batch_size();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  int64_t actualBatchSize = dataBatch.getSize();
-  const vector<Argument>& inArgs = dataBatch.getStreams();
-  vector<Argument> outArgs;
-
-  UpdateCallback updateCallback = [parameterUpdater,
-                                   parameterCheck](Parameter* para) {
-    parameterCheck[para->getID()]
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    parameterUpdater->update(para);
-  };
-
-  parameterUpdater->startPass();
-  parameterUpdaterCheck->startPass();
-
-  for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2;
-       ++i) {
-    PassType passType = parameterUpdater->startBatch(actualBatchSize);
-    gradientMachine->forwardBackward(
-        inArgs, &outArgs, passType, updateCallback);
-    parameterUpdater->finishBatch(0);
-
-    parameterUpdaterCheck->startBatch(actualBatchSize);
-    for (auto& para : parameterCheck) {
-      parameterUpdaterCheck->update(para.get());
-    }
-    parameterUpdaterCheck->finishBatch(0);
-  }
-
-  double sum = 0.0f;
-  for (size_t i = 0; i != parameters.size(); ++i) {
-    real *v1, *v2;
-    CpuVector trainerPara(parameters[i]->getSize());
-    trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    if (!FLAGS_use_gpu) {
-      v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData();
-    } else {
-      v1 = trainerPara.getData();
-    }
-    v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData();
-
-    size_t size = parameters[i]->getSize();
-    double diff = 0;
-    for (size_t j = 0; j < size; ++j) {
-      diff += fabs(v1[j] - v2[j]);
-    }
-    sum += diff;
-    LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20)
-              << parameters[i]->getName() << "diff=" << setw(15) << diff;
-  }
-
-  parameterUpdater->finishPass();
-  parameterUpdaterCheck->finishPass();
-  gradientMachine->finish();
-  return sum;
-}
-
-void checkRemoteParameterUpdaterTest(const string& configFile,
-                                     bool useGpu,
-                                     bool parallel,
-                                     int trainerCount = 1,
-                                     bool useOldUpdater = false,
-                                     int num_batches_per_get_parameter = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_use_old_updater = useOldUpdater;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  FLAGS_local = 0;
-  std::shared_ptr<ParameterServer2> pserver;
-  pserver.reset(new ParameterServer2(std::string(), FLAGS_port));
-  pserver->init();
-  pserver->start();
-
-  TrainerForTest trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  config->getOptConfig().set_num_batches_per_get_parameter(
-      num_batches_per_get_parameter);
-  trainer.init(config);
-  EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0);
-
-  FLAGS_local = 1;
-}
-
-TEST(checkRemoteUpdater, cpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false);
-}
-
-TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkRemoteUpdater, gpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false);
-}
-
-TEST(checkRemoteUpdater, gpu2Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2);
-}
-
-TEST(checkRemoteUpdater, gpu4Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4);
-}
-
-TEST(checkRemoteUpdater, gpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true);
-}
-
-TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true);
-}
-
-TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true);
-}
-
-#endif
-
-TEST(checkRemoteUpdater, cpuDeltaTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10);
-}
-
-TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10);
-}
-
-TEST(SgdThreadUpdater, simpleSparseNN) {
-  trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-  gNumDevices = hl_get_device_count();
-
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameteres
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/trainer/tests/test_config.conf b/paddle/legacy/trainer/tests/test_config.conf
deleted file mode 100644
index bce687ad8..000000000
--- a/paddle/legacy/trainer/tests/test_config.conf
+++ /dev/null
@@ -1,77 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "legacy/trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-    async_load_data = False))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-wt = data_layer(name='weight', size=1)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=12,
-               bias_attr=True,
-               param_attr=ParamAttr(name='sharew'),
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=True,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(drop_rate=0.5),
-               act=SquareActivation())
-
-pool = img_pool_layer(input=fc2,
-                      pool_size=2,
-                      pool_size_y=3,
-                      num_channels=1,
-                      padding=1,
-                      padding_y=2,
-                      stride=2,
-                      stride_y=3,
-                      pool_type=CudnnAvgPooling())
-
-concat = concat_layer(input=[fc3, fc4])
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as output:
-    output += full_matrix_projection(input=fc1)
-    output += trans_full_matrix_projection(input=fc2,
-                                           param_attr=ParamAttr(name='sharew'))
-    output += full_matrix_projection(input=concat)
-    output += identity_projection(input=fc3)
-
-lbl = data_layer(name='label', size=1)
-
-cost = classification_cost(input=output, label=lbl, weight=wt,
-                           layer_attr=ExtraAttr(device=-1))
-
-nce = nce_layer(input=fc2, label=lbl, weight=wt,
-                num_classes=3, 
-                neg_distribution=[0.1, 0.3, 0.6])
-                
-outputs(cost, nce)
diff --git a/paddle/legacy/trainer/tests/test_gen_dict.txt b/paddle/legacy/trainer/tests/test_gen_dict.txt
deleted file mode 100644
index 1000f9005..000000000
--- a/paddle/legacy/trainer/tests/test_gen_dict.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-0
-1
-2
-3
-4
-5
-6
-7
-8
diff --git a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
deleted file mode 100644
index 47b4e82cd..000000000
--- a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-
-#include <paddle/legacy/trainer/Trainer.h>
-#include <paddle/legacy/utils/PythonUtil.h>
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& CONFIG_FILE =
-    "legacy/trainer/tests/sample_trainer_rnn_gen.conf";
-static const string& NEST_CONFIG_FILE =
-    "legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf";
-static const string& OUTPUT_DIR = "legacy/trainer/tests/dump_text.test";
-static string modelDir =
-    "legacy/trainer/tests/rnn_gen_test_model_dir/t1";       // NOLINT
-static string expectFile =                                  // NOLINT
-    "legacy/trainer/tests/rnn_gen_test_model_dir/r1.test";  // NOLINT
-
-DECLARE_string(config_args);
-
-vector<float> readRetFile(const string& fname) {
-  ifstream inFile(fname);
-  float ret;
-  vector<float> nums;
-  while (inFile >> ret) {
-    nums.push_back(ret);
-  }
-  return nums;
-}
-
-void checkOutput(const string& expRetFile) {
-  vector<float> rets = readRetFile(OUTPUT_DIR);
-  vector<float> expRets = readRetFile(expRetFile);
-  EXPECT_EQ(rets.size(), expRets.size());
-  for (size_t i = 0; i < rets.size(); i++) {
-    EXPECT_FLOAT_EQ(rets[i], expRets[i]);
-  }
-}
-
-void prepareInArgs(vector<Argument>& inArgs,
-                   const size_t batchSize,
-                   bool useGpu,
-                   bool hasSubseq) {
-  inArgs.clear();
-  // sentence id
-  Argument sentId;
-  sentId.value = nullptr;
-  if (hasSubseq) {
-    // as there is only one sequence, there is only one label.
-    IVector::resizeOrCreate(sentId.ids, 1, useGpu);
-    sentId.ids->setElement(0, 0);
-  } else {
-    // as there is batchSize word, there is batchSize label.
-    IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
-    for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
-  }
-  inArgs.emplace_back(sentId);
-
-  // a dummy layer to decide batch size
-  Argument dummyInput;
-  dummyInput.value = Matrix::create(batchSize, 2, false, useGpu);
-  dummyInput.value->randomizeUniform();
-  if (hasSubseq) {
-    // generate one sequence with batchSize subsequence,
-    // and each subsequence has only one word.
-    dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false);
-    int* buf = dummyInput.sequenceStartPositions->getMutableData(false);
-    dummyInput.subSequenceStartPositions =
-        ICpuGpuVector::create(batchSize + 1, false);
-    int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false);
-    buf[0] = 0;
-    buf[1] = batchSize;
-    for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i;
-  }
-  inArgs.emplace_back(dummyInput);
-}
-
-void testGeneration(const string& configFile,
-                    bool useGpu,
-                    bool hasSubseq,
-                    const string& expRetFile) {
-  FLAGS_use_gpu = useGpu;
-  auto config = std::make_shared<TrainerConfigHelper>(configFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(modelDir);
-  vector<Argument> inArgs(2);
-
-  const size_t batchSize = 15;
-  prepareInArgs(inArgs, batchSize, useGpu, hasSubseq);
-  vector<Argument> outArgs;
-  unique_ptr<Evaluator> testEvaluator(gradientMachine->makeEvaluator());
-  testEvaluator->start();
-  gradientMachine->forward(inArgs, &outArgs, PASS_TEST);
-  gradientMachine->eval(testEvaluator.get());
-  testEvaluator->finish();
-  checkOutput(expRetFile);
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-TEST(RecurrentGradientMachine, test_generation) {
-#ifndef PADDLE_WITH_CUDA
-  const auto useGpuConfs = {false};
-#else
-  const auto useGpuConfs = {true, false};
-#endif
-  auto testGen = [&](const string& configFile,
-                     bool hasSubseq,
-                     const string& expRetFile,
-                     bool beam_search) {
-    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
-    for (auto useGpu : useGpuConfs) {
-      LOG(INFO) << configFile << " useGpu=" << useGpu
-                << " beam_search=" << beam_search;
-      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
-    }
-  };
-  testGen(CONFIG_FILE, false, expectFile + ".nobeam", false);  // no beam search
-  testGen(CONFIG_FILE, false, expectFile + ".beam", true);     // beam search
-  // In hierarchical RNN, beam search and one way search are only in inner-RNN,
-  // outer-RNN will concat the generated inner-results (first for beam search)
-  // from inner-RNN. Thus, they have the same outer-results.
-  testGen(NEST_CONFIG_FILE,
-          true,
-          expectFile + ".nest",
-          false);  // no beam search
-  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true);  // beam search
-}
-#endif
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  CHECK(argc == 1 || argc == 3);
-  if (argc == 3) {
-    modelDir = argv[1];
-    expectFile = argv[2];
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/legacy/utils/.gitignore b/paddle/legacy/utils/.gitignore
deleted file mode 100644
index f2cfd7409..000000000
--- a/paddle/legacy/utils/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-enable_virtualenv.c
diff --git a/paddle/legacy/utils/Any.h b/paddle/legacy/utils/Any.h
deleted file mode 100644
index 99a0139ac..000000000
--- a/paddle/legacy/utils/Any.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#if __cplusplus > 201402L
-#include <any>
-
-namespace paddle {
-// using std::any for C++ 17
-using std::any;
-using std::any_cast;
-using std::bad_any_cast;
-}  // namespace paddle
-
-#else
-#include <any.hpp>
-
-namespace paddle {
-// use linb::any for C++ 11
-using linb::any;
-using linb::any_cast;
-using linb::bad_any_cast;
-}  // namespace paddle
-#endif
diff --git a/paddle/legacy/utils/CMakeLists.txt b/paddle/legacy/utils/CMakeLists.txt
deleted file mode 100644
index b42b2bae9..000000000
--- a/paddle/legacy/utils/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-# The utilities for paddle
-file(GLOB UTIL_HEADERS . *.h)
-file(GLOB UTIL_SOURCES . *.cpp)
-create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
-  ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
-set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
-
-if(APPLE)
-    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
-else()
-    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
-endif()
-add_library(paddle_utils STATIC
-        ${UTIL_SOURCES}
-        ${UTIL_ARCH_SOURCES}
-        ${UTIL_RES})
-add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/legacy/utils/ClassRegistrar.h b/paddle/legacy/utils/ClassRegistrar.h
deleted file mode 100644
index 5f40a0b25..000000000
--- a/paddle/legacy/utils/ClassRegistrar.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <string>
-
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * This class is used to keep a set of class types. It can register a
- * class by a type name and create an instance of a class by type.
- * Example:
- *   // Declare the registrar
- *   ClassRegistrar<Layer, LayerConfig> registar_;
- *
- *   // Register a class using its constructor
- *   registrar_.registerClass<ConvLayer>("conv");
- *
- *   // Register a class using a creation function
- *   registrar_.registerClass("pool", [](LayerConfig& config){
- *     return PoolLayer::create(config);
- *   });
- *
- *   // create a class instance by type name
- *   Layer* layer = registrar_.createByType("conv", config);
- */
-template <class BaseClass, typename... CreateArgs>
-class ClassRegistrar {
- public:
-  typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
-
-  // Register a class using a creation function.
-  // The creation function's arguments are CreateArgs
-  void registerClass(const std::string& type, ClassCreator creator) {
-    CHECK(creatorMap_.count(type) == 0) << "Duplicated class type: " << type;
-    creatorMap_[type] = creator;
-  }
-
-  // Register a class using its constructor
-  // The constructor's arguments are CreateArgs
-  template <class ClassType>
-  void registerClass(const std::string& type) {
-    registerClass(type,
-                  [](CreateArgs... args) { return new ClassType(args...); });
-  }
-
-  // Create a class instance of type @type using args
-  BaseClass* createByType(const std::string& type, CreateArgs... args) {
-    ClassCreator creator;
-    CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: "
-                                               << type;
-    return creator(args...);
-  }
-
-  template <typename T>
-  inline void forEachType(T callback) {
-    for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) {
-      callback(it->first);
-    }
-  }
-
- protected:
-  std::map<std::string, ClassCreator> creatorMap_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Common.h b/paddle/legacy/utils/Common.h
deleted file mode 100644
index 1f1d0255a..000000000
--- a/paddle/legacy/utils/Common.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Excepts.h"
-
-/**
- * Disable copy macro.
- */
-#define DISABLE_COPY(class_name)                \
-  class_name(class_name &&) = delete;           \
-  class_name(const class_name &other) = delete; \
-  class_name &operator=(const class_name &other) = delete
-
-namespace paddle {
-
-#ifdef PADDLE_TYPE_DOUBLE
-using real = double;
-#else
-using real = float;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CpuId.cpp b/paddle/legacy/utils/CpuId.cpp
deleted file mode 100644
index 66e7c6606..000000000
--- a/paddle/legacy/utils/CpuId.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/CpuId.h"
-#include "paddle/legacy/utils/Util.h"
-
-#ifdef _WIN32
-
-#include <intrin.h>
-
-/// for MSVC
-#define CPUID(info, x) __cpuidex(info, x, 0)
-
-#else
-
-#if !defined(__arm__) && !defined(__aarch64__)
-#include <cpuid.h>
-/// for GCC/Clang
-#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
-#endif
-
-#endif
-
-namespace paddle {
-
-SIMDFlags::SIMDFlags() {
-#if defined(__arm__) || defined(__aarch64__)
-  simd_flags_ = SIMD_NEON;
-#else
-  unsigned int cpuInfo[4];
-  // CPUID: https://en.wikipedia.org/wiki/CPUID
-  // clang-format off
-  CPUID(cpuInfo, 0x00000001);
-  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
-  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
-
-  CPUID(cpuInfo, 0x00000007);
-  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
-
-  CPUID(cpuInfo, 0x80000001);
-  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
-  // clang-fotmat on
-#endif
-}
-
-SIMDFlags const* SIMDFlags::instance() {
-  static SIMDFlags instance;
-  return &instance;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CpuId.h b/paddle/legacy/utils/CpuId.h
deleted file mode 100644
index ed58211d1..000000000
--- a/paddle/legacy/utils/CpuId.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Common.h"
-#include "Error.h"
-
-namespace paddle {
-
-// clang-format off
-enum simd_t {
-  SIMD_NONE   = 0,          ///< None
-  SIMD_SSE    = 1 << 0,     ///< SSE
-  SIMD_SSE2   = 1 << 1,     ///< SSE 2
-  SIMD_SSE3   = 1 << 2,     ///< SSE 3
-  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
-  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
-  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
-  SIMD_FMA3   = 1 << 6,     ///< FMA 3
-  SIMD_FMA4   = 1 << 7,     ///< FMA 4
-  SIMD_AVX    = 1 << 8,     ///< AVX
-  SIMD_AVX2   = 1 << 9,     ///< AVX 2
-  SIMD_AVX512 = 1 << 10,    ///< AVX 512
-  SIMD_NEON   = 1 << 11,    ///  NEON
-};
-// clang-format on
-
-class SIMDFlags final {
- public:
-  DISABLE_COPY(SIMDFlags);
-
-  SIMDFlags();
-
-  static SIMDFlags const* instance();
-
-  inline bool check(int flags) const {
-    return !((simd_flags_ & flags) ^ flags);
-  }
-
- private:
-  int simd_flags_ = SIMD_NONE;
-};
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * For example.
- * @code{.cpp}
- *
- * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
- *      avx2_fm4_stub();
- * } else if (HAS_SIMD(SIMD_AVX)) {
- *      avx_stub();
- * }
- *
- * @endcode
- */
-#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * 1. Check all SIMD flags at runtime:
- *
- * @code{.cpp}
- * if (HAS_AVX && HAS_AVX2) {
- *      avx2_stub();
- * }
- * @endcod
- *
- * 2. Check one SIMD flag at runtime:
- *
- * @code{.cpp}
- * if (HAS_SSE41 || HAS_SSE42) {
- *      sse4_stub();
- * }
- * @endcode
- */
-// clang-format off
-#define HAS_SSE     HAS_SIMD(SIMD_SSE)
-#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
-#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
-#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
-#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
-#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
-#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
-#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
-#define HAS_AVX     HAS_SIMD(SIMD_AVX)
-#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
-#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
-#define HAS_NEON    HAS_SIMD(SIMD_NEON)
-// clang-format on
-
-/**
- * Invoke checkCPUFeature() before Paddle initialization to
- * check target machine whether support compiled instructions.
- * If not, simply throw out an error.
- */
-inline Error __must_check checkCPUFeature() {
-  Error err;
-#ifndef __AVX__
-  if (HAS_AVX) {
-    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
-                 << "but these are available on your machine and could "
-                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
-  }
-#else
-  if (!HAS_AVX) {
-    err = Error(
-        "PaddlePaddle was compiled to use avx instructions, "
-        "but these aren't available on your machine, please "
-        "disable it via CMAKE .. -DWITH_AVX=OFF");
-  }
-#endif  // __AVX__
-#ifdef __SSE3__
-  if (!HAS_SSE3) {
-    err = Error(
-        "PaddlePaddle was compiled to use sse3 instructions, "
-        "which is the minimum requirement of PaddlePaddle. "
-        "But these aren't available on your current machine.");
-  }
-#endif  // __SSE3__
-
-  return err;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CustomStackTrace.cpp b/paddle/legacy/utils/CustomStackTrace.cpp
deleted file mode 100644
index 9723d7df9..000000000
--- a/paddle/legacy/utils/CustomStackTrace.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CustomStackTrace.h"
-#include <gflags/gflags.h>
-#include <iostream>
-
-DEFINE_bool(
-    layer_stack_error_only_current_thread,
-    true,
-    "Dump current thread or whole process layer stack when signal error "
-    "occurred. true means only dump current thread layer stack");
-
-namespace paddle {
-
-CustomStackTrace<std::string> gLayerStackTrace;
-
-static std::mutex gLayerStackTraceMtx;
-void installLayerStackTracer() {
-  logging::installFailureWriter([](const char* data, int sz) {
-    std::lock_guard<std::mutex> guard(gLayerStackTraceMtx);
-    if (!gLayerStackTrace.empty()) {
-      size_t curTid = -1UL;
-      std::hash<std::thread::id> hasher;
-      gLayerStackTrace.dump(
-          [&curTid, &hasher](std::thread::id tid,
-                             bool* isForwarding,
-                             const std::string& layerName) {
-            if (curTid != hasher(tid)) {
-              if (curTid != -1UL) {
-                std::cerr << std::endl;
-              }
-              curTid = hasher(tid);
-              std::cerr << "Thread [" << tid << "] ";
-              if (isForwarding) {
-                std::cerr << (*isForwarding ? "Forwarding " : "Backwarding ");
-              }
-            }
-            std::cerr << layerName << ", ";
-          },
-          FLAGS_layer_stack_error_only_current_thread);
-      std::cerr << std::endl;
-    }
-    std::cerr.write(data, sz);
-  });
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/CustomStackTrace.h b/paddle/legacy/utils/CustomStackTrace.h
deleted file mode 100644
index b60077ea2..000000000
--- a/paddle/legacy/utils/CustomStackTrace.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <stack>
-#include <thread>
-#include <unordered_map>
-
-#include "ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A ThreadLocal stack for tracing train/test process.
- * (More details of ThreadLocal can be find
- * in the comments of ThreadLocal class.)
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::CustomStackTrace<std::string> stack;
- * for (auto& layer : layers){
- *   stack.push(layer->getName());
- *   layer->forward();
- * }
- *
- * stack.pop("");  // mark under pop stage.
- *
- * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
- *   auto& layer = *it;
- *   layer->backward(passType);
- *   stack.pop(layer->getName());
- * }
- *
- * @endcode
- */
-template <typename T>
-class CustomStackTrace {
- public:
-  /**
-   * @brief Pop out an item from the top of the stack if item == top.
-   *        Else, just set status to popping.
-   */
-  void pop(const T& item) {
-    auto& s = this->stack();
-    if (item == s.top()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief Indicate whether we are at forward or backward stage of computation
-   */
-  void set_stage(bool isForward) { pushing() = isForward; }
-
-  /**
-   * @brief clear current thread stack.
-   */
-  void clear() {
-    auto& s = stack();
-    while (!s.empty()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief return true if all thread's stack is empty.
-   * @return true if empty
-   */
-  bool empty() const {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::stack<T>& s = *p.second;
-      if (!s.empty()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /**
-   * @brief DumpCallback Type. It will be invoked many times by dump method.
-   *
-   * The first parameter is stack thread id.
-   * The second parameter is the last action of stack is push or not.
-   * The third parameter is the item in stack.
-   */
-  typedef std::function<void(const std::thread::id& /*threadId*/,
-                             bool* /*isPushing*/,
-                             const T& /*item*/)>
-      DumpCallback;
-
-  /**
-   * Dump all thread stack, and all stack will be cleared.
-   */
-  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::thread::id tid = p.first;
-      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
-        continue;
-      }
-      std::stack<T>& s = *p.second;
-      bool* isPush = nullptr;
-      auto it = this->pushingBuffers_.find(tid);
-      if (it != this->pushingBuffers_.end()) {
-        isPush = it->second;
-      }
-
-      while (!s.empty()) {
-        callback(tid, isPush, s.top());
-        s.pop();
-      }
-    }
-  }
-
-  /**
-   * @brief Push item to current thread stack.
-   */
-  void push(const T& item) {
-    pushing() = true;
-    auto& p = this->stack();
-    p.push(item);
-  }
-
- private:
-  /**
-   * Get thread local attribute, and save them into a map (threadId => TYPE*)
-   *
-   * @tparam TYPE thread local attribute type.
-   * @param threadLocal Thread Local object.
-   * @param buffers a map from threadId to TYPE*
-   */
-  template <typename TYPE>
-  inline TYPE& getThreadLocal(
-      ThreadLocal<TYPE>& threadLocal,
-      std::unordered_map<std::thread::id, TYPE*>& buffers) {
-    TYPE* retv = threadLocal.get(false);
-    if (retv) {
-      return *retv;
-    } else {
-      std::lock_guard<std::mutex> guard(this->mtx_);
-      retv = threadLocal.get();
-      auto id = std::this_thread::get_id();
-      buffers.insert({id, retv});
-      return *retv;
-    }
-  }
-
-  /**
-   * @brief Get thread local stack reference.
-   */
-  std::stack<T>& stack() {
-    return this->getThreadLocal(this->logStack_, this->stackBuffers_);
-  }
-
-  /**
-   * @brief Get thread local pushing flag.
-   */
-  bool& pushing() {
-    return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
-  }
-
- private:
-  mutable std::mutex mtx_;
-
-  std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
-  std::unordered_map<std::thread::id, bool*> pushingBuffers_;
-  ThreadLocal<bool> isPushing_;
-  ThreadLocal<std::stack<T>> logStack_;
-};
-
-extern CustomStackTrace<std::string> gLayerStackTrace;
-
-/**
- * @brief Install a failure handler to print layer stack when error.
- */
-extern void installLayerStackTracer();
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/DynamicLoader.cpp b/paddle/legacy/utils/DynamicLoader.cpp
deleted file mode 100644
index 9ac4a56c6..000000000
--- a/paddle/legacy/utils/DynamicLoader.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DynamicLoader.h"
-#include <gflags/gflags.h>
-#include "Logging.h"
-
-DEFINE_string(cudnn_dir,
-              "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
-
-DEFINE_string(cuda_dir,
-              "",
-              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
-
-DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
-
-DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
-
-static inline std::string join(const std::string& part1,
-                               const std::string& part2) {
-  // directory separator
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
-                                               void** dso_handle,
-                                               int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
-// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-// bring System Integrity Projection (SIP), if dso_handle
-// is null, search from default package path in Mac OS.
-#if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == *dso_handle) {
-    dso_path = join("/usr/local/cuda/lib/", dso_path);
-    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    if (nullptr == *dso_handle) {
-      if (dso_path == "libcudnn.dylib") {
-        LOG(FATAL)
-            << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
-            << "For instance, sudo tar -xzf "
-               "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
-            << "/usr/local \n sudo chmod a+r "
-               "/usr/local/cuda/include/cudnn.h "  // NOLINT
-            << "/usr/local/cuda/lib/libcudnn*";
-      }
-    }
-  }
-#endif
-}
-
-static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
-                                              const std::string& dso_name,
-                                              void** dso_handle) {
-  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-  *dso_handle = nullptr;
-
-  std::string dlPath = dso_name;
-  if (search_root.empty()) {
-    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-  } else {
-    // search xxx.so from custom path
-    dlPath = join(search_root, dso_name);
-    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
-    // if not found, search from default path
-    if (nullptr == *dso_handle) {
-      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
-                   << dlerror() << ")";
-      dlPath = dso_name;
-      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-    }
-  }
-
-  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << " (" << dlerror() << ") \n"
-                                << "Please specify its path correctly using "
-                                   "following ways: \n"
-
-                                << "Method. set environment variable "
-                                   "LD_LIBRARY_PATH on Linux or "
-                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
-                                << "For instance, issue command: export "
-                                   "LD_LIBRARY_PATH=... \n"
-
-                                << "Note: After Mac OS 10.11, using the "
-                                   "DYLD_LIBRARY_PATH is impossible "
-                                << "unless System Integrity Protection (SIP) "
-                                   "is disabled.";
-}
-
-void GetCublasDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
-#endif
-}
-
-void GetCudnnDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
-#endif
-}
-
-void GetCurandDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
-#endif
-}
-
-void GetWarpCTCDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
-#endif
-}
-
-void GetLapackDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
-#endif
-}
-
-void GetTensorRtDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(
-      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
-#endif
-}
diff --git a/paddle/legacy/utils/DynamicLoader.h b/paddle/legacy/utils/DynamicLoader.h
deleted file mode 100644
index 02f519de4..000000000
--- a/paddle/legacy/utils/DynamicLoader.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <dlfcn.h>
-#include <memory>
-#include <mutex>
-#include <string>
-
-/**
- * @brief    load the DSO of CUBLAS
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCublasDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CUDNN
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudnnDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CURAND
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCurandDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of warp-ctc
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetWarpCTCDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of lapack
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetLapackDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of tensorrt
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetTensorRtDsoHandle(void** dso_handle);
diff --git a/paddle/legacy/utils/Error.h b/paddle/legacy/utils/Error.h
deleted file mode 100644
index 1fc8482e3..000000000
--- a/paddle/legacy/utils/Error.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-/**
- * __must_check macro. It make the function's return value must be used,
- * otherwise it will raise a compile warning. And also Paddle treat all compile
- * warnings as errors.
- */
-#ifdef __GNUC__
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#else
-#define __must_check
-#endif
-#else
-#define __must_check
-#endif
-
-namespace paddle {
-
-/**
- * Error is Paddle error code. It only contain a std::string as error message.
- *
- *
- * There are two styles to return error in Paddle.
- *
- * 1. Return Error
- *    When method return a status, the return must use `__must_check` attribute.
- *    Example as below.
- * @code{cpp}
- * Error __must_check foo();
- *
- * Error __must_check bar() {
- *   // do something.
- *   Error err = foo();  // invoke other method return status.
- *   if (err) return err;
- *   // do something else.
- *   return Error();
- * }
- * @endcode{cpp}
- *
- * 2. Return by parameter.
- *    It is another way to return an error, by using a pointer parameter.
- *    Example as below.
- *
- * @code{cpp}
- * Error bar();
- *
- * int foo(Error* error) {
- *   // Do something.
- *   Error err = bar();
- *   if (err) {
- *     *error = s;
- *     return 0;
- *   }
- *   // Do something else.
- *   if (someInternalErrorHappend) {
- *     *error = Error("Some dimension is too large, %d", dimension);
- *     return 0;
- *   }
- *   // End of method.
- *   return someValue;
- * }
- *
- * Error foobar() {
- *   Error err;
- *   // do something.
- *   foo(&err);
- *   if (err) return err;
- * }
- * @endcode{cpp}
- *
- *
- * Currently there is a helper method 'check' in status, because Paddle always
- * use log(FATAL) or CHECK to make program exit before. When we clean all
- * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
- */
-class Error {
- public:
-  /**
-   * Construct a no-error value.
-   */
-  Error() {}
-
-  /**
-   * @brief Create an Error use printf syntax.
-   */
-  explicit Error(const char* fmt, ...) {
-    va_list ap;
-    va_start(ap, fmt);
-    constexpr size_t kBufferSize = 1024;
-    char buffer[kBufferSize];
-    vsnprintf(buffer, kBufferSize, fmt, ap);
-    this->msg_.reset(new std::string(buffer));
-    va_end(ap);
-  }
-
-  /**
-   * @brief msg will return the error message. If no error, return nullptr.
-   */
-  const char* msg() const {
-    if (msg_) {
-      return msg_->c_str();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief check this status by glog.
-   * @note It is a temp method used during cleaning Paddle code. It will be
-   *       removed later.
-   */
-  void check() const { CHECK(this->isOK()) << msg(); }
-
-  /**
-   * @brief isOK return True if there is no error.
-   * @return True if no error.
-   */
-  bool isOK() const { return msg_ == nullptr; }
-
- private:
-  std::shared_ptr<std::string> msg_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Excepts.h b/paddle/legacy/utils/Excepts.h
deleted file mode 100644
index 5c2c504f5..000000000
--- a/paddle/legacy/utils/Excepts.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef EXCEPTS_H_
-#define EXCEPTS_H_
-
-#include <fenv.h>
-
-#if defined(__APPLE__) || defined(__OSX__)
-
-int fegetexcept(void);
-int feenableexcept(unsigned int excepts);
-int fedisableexcept(unsigned int excepts);
-
-#endif
-
-#endif  // EXCEPTS_H_
diff --git a/paddle/legacy/utils/Flags.cpp b/paddle/legacy/utils/Flags.cpp
deleted file mode 100644
index ea47cf23e..000000000
--- a/paddle/legacy/utils/Flags.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Flags.h"
-
-#ifndef PADDLE_WITH_CUDA
-DEFINE_bool(use_gpu, false, "Only support CPU training");
-#else
-DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-// TODO(TJ): change to true when MKLDNN layers support multi-inputs
-DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
-#else
-DEFINE_bool(use_mkldnn, false, "Only support CPU training");
-#endif
-
-#ifdef PADDLE_WITH_MKLML
-// TODO(TJ): change to true when fully confirmed
-DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization");
-#else
-DEFINE_bool(use_mkl_packed, false, "Not to use MKL Packed Optimization");
-#endif
-
-DEFINE_bool(parallel_nn,
-            false,
-            "Whether to use multi-threads to calculate one neural network."
-            "If it was set false, use gpu_id specify which gpu core to use"
-            "(the device property in the trainer config file will be ingored)."
-            "If it was set true, the gpu core is specified by the trainer"
-            "  config file(gpu_id will be ignored).");
-DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
-DEFINE_int32(gpu_id, 0, "Which gpu core to use");
-DEFINE_int32(port, 20134, "Listening port for pserver");
-DEFINE_int32(ports_num,
-             1,
-             "Number of ports for sending dense parameter,"
-             " following ports on parameter server will be visited"
-             " for sending dense parameter: [port, port+ports_num-1]");
-DEFINE_int32(ports_num_for_sparse,
-             0,
-             "Number of ports for sending sparse parameter,"
-             " following ports on parameter server will be visited"
-             " for sending sparse parameter:"
-             " [port+ports_num, port+ports_num+ports_num_for_sparse-1]");
-DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
-DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
-DEFINE_int32(trainer_id,
-             0,
-             "For distributed training, each trainer must be given an unique id"
-             " ranging from 0 to num_trainers-1. Trainer 0 is the master"
-             " trainer");
-DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
-DEFINE_string(comment, "", "A string for commenting this training task");
-DEFINE_string(load_missing_parameter_strategy,
-              "fail",
-              "which operation to take on load model fails. support "
-              "fail/rand/zero only.");
-DEFINE_int32(log_period, 100, "Log progress every so many batches");
-DEFINE_int32(log_period_server,
-             500,
-             "Log progress every so many batches at pserver end");
-DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
-DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector");
-DEFINE_bool(loadsave_parameters_in_pserver,
-            false,
-            "load and save parameters in pserver. "
-            "only work while parameter set sparse_remote_update.");
-DEFINE_int32(beam_size,
-             1,
-             "Beam size used in generating most probable output sequences.");
-
-DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
-DEFINE_string(predict_file, "", "File name for saving predict result");
-DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
-DEFINE_string(init_model_path,
-              "",
-              "Path of the initial model parameters."
-              "If it was set, start_pass will be ignored.");
diff --git a/paddle/legacy/utils/Flags.h b/paddle/legacy/utils/Flags.h
deleted file mode 100644
index b64295bca..000000000
--- a/paddle/legacy/utils/Flags.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gflags/gflags.h>
-
-DECLARE_bool(parallel_nn);
-DECLARE_int32(async_count);
-DECLARE_int32(port);
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_int32(trainer_count);
-DECLARE_int32(ports_num);
-DECLARE_int32(ports_num_for_sparse);
-DECLARE_string(nics);
-DECLARE_string(rdma_tcp);
-DECLARE_int32(trainer_id);
-DECLARE_int32(num_gradient_servers);
-DECLARE_string(comment);
-DECLARE_string(load_missing_parameter_strategy);
-DECLARE_int32(log_period);
-DECLARE_int32(log_period_server);
-DECLARE_double(checkgrad_eps);
-DECLARE_int32(enable_parallel_vector);
-DECLARE_bool(loadsave_parameters_in_pserver);
-DECLARE_int32(beam_size);
-DECLARE_bool(show_layer_stat);
-DECLARE_string(predict_file);
-DECLARE_bool(prev_batch_state);
-DECLARE_string(init_model_path);
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkl_packed);
diff --git a/paddle/legacy/utils/GlobalConstants.cpp b/paddle/legacy/utils/GlobalConstants.cpp
deleted file mode 100644
index 9e8dade0b..000000000
--- a/paddle/legacy/utils/GlobalConstants.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GlobalConstants.h"
-
-namespace paddle {
-
-const std::string TrainAlgorithm::SGD = "sgd";
-const std::string TrainAlgorithm::AsyncSGD = "async_sgd";
-const std::string TrainAlgorithm::OWLQN = "owlqn";
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/GlobalConstants.h b/paddle/legacy/utils/GlobalConstants.h
deleted file mode 100644
index 3f45e8226..000000000
--- a/paddle/legacy/utils/GlobalConstants.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-namespace paddle {
-
-namespace enumeration_wrapper {
-enum PassType {
-  PASS_TRAIN,   // Train pass
-  PASS_TEST,    // Test pass
-  PASS_GC,      // Gradient Check pass
-  PASS_METRIC,  // pass for generate template output with no drop rate.
-};
-
-enum ParameterType {
-  PARAMETER_VALUE = 0,
-  PARAMETER_GRADIENT,
-  PARAMETER_MOMENTUM,
-
-  // Used by ParameterAverager
-  PARAMETER_SUM1,
-  PARAMETER_SUM2,
-  PARAMETER_SUM3,
-
-  //   also used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_LEARNING_RATE,
-
-  // Used by Sparse SGD update
-  PARAMETER_UPDATE_TIME,
-
-  // Used by async_sgd
-  // Change of the parameter since last remote update
-  PARAMETER_DELTA,
-
-  // Used by BatchRemoteParameterUpdater
-  PARAMETER_GRADIENT_SUM,
-
-  // Used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_GRADIENT_SQURESUM,
-  PARAMETER_GRADIENT_SQURESUM1,
-
-  // Used by SparseConnected layer
-  PARAMETER_ROWS,
-  PARAMETER_COLS,
-
-  // Used by Adam Optimizer.
-  PARAMETER_SECOND_MOMENTUM,
-
-  // Used By AdaMax Optimizer.
-  PARAMETER_WEIGHTED_INFINITY_NORM,
-
-  // Used by remote parameter average
-  PARAMETER_APPLY,
-
-  // Used by sparse momentum
-  PARAMETER_MOMENTUM_UT,
-  PARAMETER_MOMENTUM_VT,
-
-  NUM_PARAMETER_TYPES,
-};
-
-}  // namespace enumeration_wrapper
-
-//! explicit import enum into paddle namespace.
-using namespace enumeration_wrapper;  // NOLINT
-
-class TrainAlgorithm {
- public:
-  static const std::string SGD;
-  static const std::string AsyncSGD;
-  static const std::string OWLQN;
-
-  static inline bool isValid(const std::string& algo) {
-    return algo == SGD || algo == AsyncSGD || algo == OWLQN;
-  }
-};
-
-#ifdef __AVX__
-const int ALIGN_HINT = 32;
-#else
-const int ALIGN_HINT = 16;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Locks.h b/paddle/legacy/utils/Locks.h
deleted file mode 100644
index 65f983685..000000000
--- a/paddle/legacy/utils/Locks.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <pthread.h>
-#include <sys/time.h>
-#include <condition_variable>
-#include <mutex>
-
-#include "Common.h"
-
-namespace paddle {
-
-/**
- * A simple read-write lock.
- * The RWlock allows a number of readers or at most one writer
- * at any point in time.
- * The RWlock disable copy.
- *
- * Lock:
- *
- * Use lock() to lock on write mode, no other thread can get it
- * until unlock.
- *
- * Use lock_shared() to lock on read mode, other thread can get
- * it by using the same method lock_shared().
- *
- * Unlock:
- *
- * Use unlock() to unlock the lock.
- */
-class RWLock {
- public:
-  RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
-  ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
-  RWLock(const RWLock&) = delete;
-  RWLock& operator=(const RWLock&) = delete;
-
-  /**
-   * @brief lock on write mode.
-   * @note the method will block the thread, if failed to get the lock.
-   */
-  // std::mutex interface
-  void lock() { pthread_rwlock_wrlock(&rwlock_); }
-  /**
-   * @brief lock on read mode.
-   * @note if another thread is writing, it can't get the lock,
-   * and will block the thread.
-   */
-  void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
-  void unlock() { pthread_rwlock_unlock(&rwlock_); }
-
- protected:
-  pthread_rwlock_t rwlock_;
-};
-
-/**
- * The ReadLockGuard is a read mode RWLock
- * using RAII management mechanism.
- */
-class ReadLockGuard {
- public:
-  /**
-   * @brief Construct Function. Lock on rwlock in read mode.
-   */
-  explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) {
-    rwlock_->lock_shared();
-  }
-
-  /**
-   * @brief Destruct Function.
-   * @note This method just unlock the read mode rwlock,
-   * won't destroy the lock.
-   */
-  ~ReadLockGuard() { rwlock_->unlock(); }
-
- protected:
-  RWLock* rwlock_;
-};
-
-/**
- * A simple wrapper for spin lock.
- * The lock() method of SpinLock is busy-waiting
- * which means it will keep trying to lock until lock on successfully.
- * The SpinLock disable copy.
- */
-class SpinLockPrivate;
-class SpinLock {
- public:
-  DISABLE_COPY(SpinLock);
-  SpinLock();
-  ~SpinLock();
-
-  // std::mutext interface
-  void lock();
-  void unlock();
-
- private:
-  SpinLockPrivate* m;
-};
-
-/**
- * A simple wapper of semaphore which can only be shared in the same process.
- */
-class SemaphorePrivate;
-class Semaphore {
- public:
-  //! Disable copy & assign
-  Semaphore(const Semaphore& other) = delete;
-  Semaphore& operator=(const Semaphore&& other) = delete;
-
-  //! Enable move.
-  Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
-
- public:
-  /**
-   * @brief Construct Function.
-   * @param[in] initValue the initial value of the
-   * semaphore, default 0.
-   */
-  explicit Semaphore(int initValue = 0);
-
-  ~Semaphore();
-
-  /**
-   * @brief The same as wait(), except if the decrement can not
-   * be performed until ts return false install of blocking.
-   * @param[in] ts an absolute timeout in seconds and nanoseconds
-   * since the Epoch 1970-01-01 00:00:00 +0000(UTC).
-   * @return ture if the decrement proceeds before ts,
-   * else return false.
-   */
-  bool timeWait(struct timespec* ts);
-
-  /**
-   * @brief decrement the semaphore. If the semaphore's value is 0, then call
-   * blocks.
-   */
-  void wait();
-
-  /**
-   * @brief increment the semaphore. If the semaphore's value
-   * greater than 0, wake up a thread blocked in wait().
-   */
-  void post();
-
- private:
-  SemaphorePrivate* m;
-};
-
-/**
- * A simple wrapper of thread barrier.
- * The ThreadBarrier disable copy.
- */
-class ThreadBarrierPrivate;
-class ThreadBarrier {
- public:
-  DISABLE_COPY(ThreadBarrier);
-
-  /**
-   * @brief Construct Function. Initialize the barrier should
-   * wait for count threads in wait().
-   */
-  explicit ThreadBarrier(int count);
-  ~ThreadBarrier();
-
-  /**
-   * @brief .
-   * If there were count - 1 threads waiting before,
-   * then wake up all the count - 1 threads and continue run together.
-   * Else block the thread until waked by other thread .
-   */
-  void wait();
-
- private:
-  ThreadBarrierPrivate* m;
-};
-
-/**
- * A wrapper for condition variable with mutex.
- */
-class LockedCondition : public std::condition_variable {
- public:
-  /**
-   * @brief execute op and notify one thread which was blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_one(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_one();
-  }
-
-  /**
-   * @brief execute op and notify all the threads which were blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_all(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_all();
-  }
-
-  /**
-   * @brief wait until pred return ture.
-   * @tparam Predicate c++ concepts, describes a function object
-   * that takes a single iterator argument
-   * that is dereferenced and used to
-   * return a value testable as a bool.
-   * @note pred shall not apply any non-constant function
-   * through the dereferenced iterator.
-   */
-  template <class Predicate>
-  void wait(Predicate pred) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    std::condition_variable::wait(lock, pred);
-  }
-
-  /**
-   * @brief get mutex.
-   */
-  std::mutex* mutex() { return &mutex_; }
-
- protected:
-  std::mutex mutex_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Logging.cpp b/paddle/legacy/utils/Logging.cpp
deleted file mode 100644
index ea96bad24..000000000
--- a/paddle/legacy/utils/Logging.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include "Logging.h"
-#include <cstdlib>
-
-namespace paddle {
-
-void initializeLogging(int argc, char** argv) {
-  (void)(argc);
-  if (!getenv("GLOG_logtostderr")) {
-    google::LogToStderr();
-  }
-  google::InstallFailureSignalHandler();
-  google::InitGoogleLogging(argv[0]);
-}
-
-namespace logging {
-
-void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
-
-void installFailureFunction(void (*callback)()) {
-  google::InstallFailureFunction(callback);
-}
-
-void installFailureWriter(void (*callback)(const char*, int)) {
-  google::InstallFailureWriter(callback);
-}
-
-}  // namespace logging
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Logging.h b/paddle/legacy/utils/Logging.h
deleted file mode 100644
index d9e551f08..000000000
--- a/paddle/legacy/utils/Logging.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.h
- * Used in embedded system where there is no glogs.
- */
-
-#pragma once
-#include <memory>
-#include <sstream>
-#include <string>
-
-#include <glog/logging.h>
-namespace paddle {
-
-void initializeLogging(int argc, char** argv);
-
-namespace logging {
-
-void setMinLogLevel(int level);
-
-void installFailureFunction(void (*callback)());
-
-void installFailureWriter(void (*callback)(const char*, int));
-
-}  // namespace logging
-}  // namespace paddle
-
-#ifndef NDEBUG
-#define DEBUG_LEVEL 5
-#define DBG VLOG(DEBUG_LEVEL)
-#else
-#define DBG DLOG(INFO)
-#endif
diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp
deleted file mode 100644
index 21ed049c4..000000000
--- a/paddle/legacy/utils/PythonUtil.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PythonUtil.h"
-#include <signal.h>
-#include <sstream>
-
-namespace paddle {
-
-#ifdef PADDLE_NO_PYTHON
-
-DEFINE_string(python_path, "", "python path");
-DEFINE_string(python_bin, "python2.7", "python bin");
-
-constexpr int kExecuteCMDBufLength = 204800;
-
-int executeCMD(const char* cmd, char* result) {
-  char bufPs[kExecuteCMDBufLength];
-  char ps[kExecuteCMDBufLength] = {0};
-  FILE* ptr;
-  strncpy(ps, cmd, kExecuteCMDBufLength);
-  if ((ptr = popen(ps, "r")) != NULL) {
-    size_t count = fread(bufPs, 1, kExecuteCMDBufLength, ptr);
-    memcpy(result,
-           bufPs,
-           count - 1);  // why count-1: remove the '\n' at the end
-    result[count] = 0;
-    pclose(ptr);
-    ptr = NULL;
-    return count - 1;
-  } else {
-    LOG(FATAL) << "popen failed";
-    return -1;
-  }
-}
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args) {
-  std::string pythonLibPath = "";
-  std::string pythonBinPath = "";
-  if (!FLAGS_python_path.empty()) {
-    pythonLibPath = FLAGS_python_path + "/lib:";
-    pythonBinPath = FLAGS_python_path + "/bin/";
-  }
-  std::string s = "LD_LIBRARY_PATH=" + pythonLibPath + "$LD_LIBRARY_PATH " +
-                  pythonBinPath + std::string(FLAGS_python_bin) +
-                  " -c 'import " + moduleName + "\n" + "print " + moduleName +
-                  "." + funcName + "(";
-  for (auto& arg : args) {
-    s = s + "\"" + arg + "\", ";
-  }
-  s += ")'";
-  char result[kExecuteCMDBufLength] = {0};
-  LOG(INFO) << " cmd string: " << s;
-  int length = executeCMD(s.c_str(), result);
-  CHECK_NE(-1, length);
-  return std::string(result, length);
-}
-
-#else
-
-static std::recursive_mutex g_pyMutex;
-
-PyGuard::PyGuard() : guard_(g_pyMutex) {}
-
-static void printPyErrorStack(std::ostream& os,
-                              bool withEndl = false,
-                              bool withPyPath = true) {
-  PyObject *ptype, *pvalue, *ptraceback;
-  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
-  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  PyErr_Clear();
-  if (withPyPath) {
-    os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path")));
-    if (withEndl) {
-      os << std::endl;
-    }
-  }
-  PyTracebackObject* obj = (PyTracebackObject*)ptraceback;
-
-  os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) << " : "
-     << (pvalue == NULL ? "" : PyString_AsString(PyObject_Str(pvalue)));
-  if (withEndl) {
-    os << std::endl;
-  }
-  os << "Python Callstack: ";
-  if (withEndl) {
-    os << std::endl;
-  }
-  while (obj != NULL) {
-    int line = obj->tb_lineno;
-    const char* filename =
-        PyString_AsString(obj->tb_frame->f_code->co_filename);
-    os << "            " << filename << " : " << line;
-    if (withEndl) {
-      os << std::endl;
-    }
-    obj = obj->tb_next;
-  }
-
-  Py_XDECREF(ptype);
-  Py_XDECREF(pvalue);
-  Py_XDECREF(ptraceback);
-}
-PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
-                                   const std::string& funcName,
-                                   const std::vector<std::string>& args) {
-  PyGuard guard;
-  PyObjectPtr pyModule = py::import(moduleName);
-  PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str()));
-  CHECK_PY(pyFunc) << "GetAttrString failed.";
-  PyObjectPtr pyArgs(PyTuple_New(args.size()));
-  for (size_t i = 0; i < args.size(); ++i) {
-    PyObjectPtr pyArg(PyString_FromString(args[i].c_str()));
-    CHECK_PY(pyArg) << "Import pyArg failed.";
-    PyTuple_SetItem(pyArgs.get(), i, pyArg.release());  //  Maybe a problem
-  }
-  PyObjectPtr ret(PyObject_CallObject(pyFunc.get(), pyArgs.get()));
-  CHECK_PY(ret) << "Call Object failed.";
-  return ret;
-}
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args) {
-  PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args);
-#if PY_MAJOR_VERSION >= 3
-  Py_ssize_t str_size = 0u;
-  const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size);
-  return std::string(str, (size_t)str_size);
-#else
-  return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-#endif  // PY_MAJOR_VERSION >= 3
-}
-
-PyObjectPtr createPythonClass(
-    const std::string& moduleName,
-    const std::string& className,
-    const std::vector<std::string>& args,
-    const std::map<std::string, std::string>& kwargs) {
-  PyGuard guard;
-  PyObjectPtr pyModule = py::import(moduleName);
-  LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str();
-  CHECK_PY(pyModule) << "Import module " << moduleName << " failed.";
-  PyObjectPtr pyDict(PyModule_GetDict(pyModule.get()));
-  CHECK_PY(pyDict) << "Get Dict failed.";
-  PyObjectPtr pyClass(PyDict_GetItemString(pyDict.get(), className.c_str()));
-  LOG(INFO) << "createPythonClass className.c_str():" << className.c_str();
-  CHECK_PY(pyClass) << "Import class " << className << " failed.";
-  PyObjectPtr argsObjectList(PyTuple_New(args.size()));
-  for (size_t i = 0; i < args.size(); ++i) {
-    PyObjectPtr pyArg(Py_BuildValue("s#", args[i].c_str(), args[i].length()));
-    PyTuple_SetItem(argsObjectList.get(), i, pyArg.release());
-  }
-
-  PyObjectPtr kwargsObjectList(PyDict_New());
-  for (auto& x : kwargs) {
-    PyObjectPtr pyArg(Py_BuildValue("s#", x.second.c_str(), x.second.length()));
-    PyDict_SetItemString(
-        kwargsObjectList.get(), x.first.c_str(), pyArg.release());
-  }
-
-  PyObjectPtr pyInstance(PyInstance_New(
-      pyClass.get(), argsObjectList.release(), kwargsObjectList.release()));
-  CHECK_PY(pyInstance) << "Create class " << className << " failed.";
-  return pyInstance;
-}
-
-namespace py {
-char* repr(PyObject* obj) { return PyString_AsString(PyObject_Repr(obj)); }
-
-std::string getPyCallStack() {
-  std::ostringstream os;
-  printPyErrorStack(os, true);
-  return os.str();
-}
-
-PyObjectPtr import(const std::string& moduleName) {
-  auto module = PyImport_ImportModule(moduleName.c_str());
-  CHECK_PY(module) << "Import " << moduleName << "Error";
-  return PyObjectPtr(module);
-}
-
-}  // namespace py
-
-#endif
-extern "C" {
-extern const char enable_virtualenv_py[];
-}
-void initPython(int argc, char** argv) {
-#ifndef PADDLE_NO_PYTHON
-  Py_SetProgramName(argv[0]);
-  Py_Initialize();
-  PySys_SetArgv(argc, argv);
-  // python blocks SIGINT. Need to enable it.
-  signal(SIGINT, SIG_DFL);
-
-  // Manually activate virtualenv when user is using virtualenv
-  PyRun_SimpleString(enable_virtualenv_py);
-#endif
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h
deleted file mode 100644
index d5b2dbddd..000000000
--- a/paddle/legacy/utils/PythonUtil.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-// clang-format off
-#include "paddle/legacy/utils/Util.h"
-
-#ifndef PADDLE_NO_PYTHON
-// must include the following two blocks, otherwise,
-// gcc compiler may produce warning
-#ifdef __APPLE__
-#define _POSIX_SOURCE
-#define _POSIX_C_SOURCE 200809L
-#define _XOPEN_SOURCE 700
-#endif
-
-#ifdef _POSIX_C_SOURCE
-#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-#ifdef _XOPEN_SOURCE
-#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-#include <Python.h>
-#include <frameobject.h>
-#endif
-
-#include <stdarg.h>
-#include <map>
-#include <mutex>
-// clang-format on
-
-namespace paddle {
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args);
-
-#ifndef PADDLE_NO_PYTHON
-
-/**
- * Global lock guard of python C-api invokes.
- * NOTE: the lock of this guard is reentrant or recursive.
- */
-class PyGuard {
- public:
-  PyGuard();
-  PyGuard(const PyGuard& other) = delete;
-  PyGuard& operator=(const PyGuard& other) = delete;
-
- private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-struct PyObjectDeleter {
-  void operator()(PyObject* obj) {
-    if (obj) {
-      Py_DECREF(obj);
-    }
-  }
-};
-
-typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
-
-PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
-                                   const std::string& funcName,
-                                   const std::vector<std::string>& args);
-
-PyObjectPtr createPythonClass(const std::string& moduleName,
-                              const std::string& className,
-                              const std::vector<std::string>& args,
-                              const std::map<std::string, std::string>& kwargs);
-
-#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
-
-namespace py {
-PyObjectPtr import(const std::string& moduleName);
-
-#if PY_MAJOR_VERSION >= 3
-/**
- * Cast a PyLong to int type T.
- * @tparam T return type.
- * @param [in] obj PyLong object.
- * @param [out] ok status for casting. False if error occured. nullptr if user
- *                 don't care is ok or not.
- * @return The value of python object, or 0 if not ok.
- */
-template <typename T>
-T castInt(PyObject* obj, bool* ok = nullptr) {
-  // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object
-  // were unified to long since python3
-  if (PyLong_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyLong_AsUnsignedLong(obj);
-  } else {
-    if (ok) *ok = false;
-    return (T)0;
-  }
-}
-
-// Convert PyAPI from 2.x to 3.x
-#define PyString_FromString PyUnicode_FromString
-#define PyString_AsString PyUnicode_AsUTF8
-
-#else
-/**
- * Cast a PyLong or PyInt to int type T.
- * @tparam T return type.
- * @param [in] obj PyLong or PyInt object.
- * @param [out] ok status for casting. False if error occured. nullptr if user
- *                 don't care is ok or not.
- * @return The value of python object, or 0 if not ok.
- */
-template <typename T>
-T castInt(PyObject* obj, bool* ok = nullptr) {
-  if (PyLong_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyLong_AsUnsignedLong(obj);
-  } else if (PyInt_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyInt_AsLong(obj);
-  } else {
-    if (ok) *ok = false;
-    return (T)0;
-  }
-}
-#endif  // PY_MAJOR_VERSION >= 3
-
-/**
- * Invoke repr of python object.
- *
- * Just like toString method in java.
- */
-char* repr(PyObject* obj);
-
-/**
- * Invoke repr of python object.
- */
-inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); }
-
-/**
- * Get Python Error Stack String.
- */
-std::string getPyCallStack();
-
-/**
- * Object Helper for PyObjectPtr.
- *
- * Implements getAttr method for object.
- */
-class ObjectHelper {
- public:
-  explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
-
-  /**
-   * get attribute
-   */
-  inline PyObject* getAttr(const std::string& field) const {
-    auto obj = PyObject_GetAttrString(obj_.get(), field.c_str());
-    CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get();
-    return obj;
-  }
-
-  /**
-   * Get Int attribute
-   * @param [in] field  attribute name.
-   * @param [out] ok true if this attribute is int.
-   * @tparam T int type.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttr(const std::string& field, bool* ok = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    return castInt<T>(tmp.get(), ok);
-  }
-
-  /**
-   * Get int attribute. Log(Fatal) when not ok
-   * @param field attribute name.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttrWithError(const std::string& field) const {
-    bool ok;
-    T tmp = getIntAttr<T>(field, &ok);
-    CHECK(ok) << "Cannot get integer attribute on object " << obj_.get();
-    return tmp;
-  }
-
-  /**
-   * Get bool attribute.
-   * @param field
-   * @param [out] isBoolType return true if attribute is bool type. If the
-   *                         attribute is not bool type, then an implicit
-   *                         conversion will happens, and will return the
-   *                         conversion result.
-   *
-   *                         Such as, if the attribute is 1, then the return
-   *                         value of function will be true, but the isBoolType
-   *                         will return false.
-   * @return
-   */
-  bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    if (isBoolType) {
-      *isBoolType = PyBool_Check(tmp.get());
-    }
-    return PyObject_IsTrue(tmp.get());
-  }
-
- private:
-  const PyObjectPtr& obj_;
-};
-
-/**
- * Python Sequence Helper
- *
- * The python sequence means list or tuple.
- */
-class SequenceHelper {
- public:
-  explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  explicit SequenceHelper(PyObject* seq) : seq_(seq) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  inline size_t size() const { return (size_t)PySequence_Size(seq_); }
-
-  inline PyObject* operator[](size_t i) const {
-    return PySequence_Fast_GET_ITEM(seq_, i);
-  }
-
-  inline double getDouble(size_t i) const {
-    auto* ptr = (*this)[i];
-    return PyFloat_AsDouble(ptr);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   * @param i index
-   * @param obj setted item.
-   * @param steal if steal = true, sequence will move object in iteself,
-   *              just like std::move. Otherwise, it will increase reference
-   *              count. Default is false.
-   */
-  inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) {
-    this->set(i, obj.get(), steal);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   */
-  inline void set(size_t i, PyObject* obj, bool steal = false) {
-    if (!steal) {
-      Py_XINCREF(obj);
-    }
-    if (PyTuple_Check(seq_)) {
-      CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    } else {
-      CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    }
-  }
-
- private:
-  PyObject* seq_;
-};
-
-class DictHelper {
- public:
-  explicit DictHelper(PyObject* d) : dict_(d) {}
-
-  explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
-
-  void set(const std::string& key, PyObject* item) {
-    PyDict_SetItemString(dict_, key.c_str(), item);
-  }
-
-  void setBool(const std::string& key, bool b) {
-    this->set(key, PyBool_FromLong(b));
-  }
-
-  void setStringList(const std::string& key,
-                     const std::vector<std::string>& items) {
-    auto* list = PyList_New(items.size());
-    for (size_t i = 0; i < items.size(); ++i) {
-      PyList_SetItem(list, i, PyString_FromString(items[i].c_str()));
-    }
-    this->set(key, list);
-  }
-
- private:
-  inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
-
-  PyObject* dict_;
-};
-
-inline static bool isCallable(const PyObjectPtr& obj) {
-  return PyCallable_Check(obj.get());
-}
-
-/**
- * Wrap a callable object.
- */
-class CallableHelper {
- public:
-  explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
-    CHECK(py::isCallable(obj_));
-  }
-
-  ~CallableHelper() {}
-
-  /**
-   * reset args, and create new tuple.
-   * @param sz args size.
-   */
-  void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); }
-
-  /**
-   * Get args sequence. User can set/get by SequenceHelper.
-   */
-  SequenceHelper getArgs() { return SequenceHelper(args); }
-
-  /**
-   * Call python method, return an object.
-   */
-  PyObject* operator()() {
-    PyGuard guard;
-    return PyObject_Call(obj_.get(), args.get(), kwargs.get());
-  }
-
- private:
-  const PyObjectPtr& obj_;
-  PyObjectPtr args;
-  PyObjectPtr kwargs;
-};
-
-inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) {
-  PyGuard g;
-  PyObject* data = PyIter_Next(context.get());
-  if (data == nullptr) {
-    if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
-      PyErr_Clear();
-      *atEnd = true;
-      return nullptr;
-    } else if (PyErr_Occurred()) {
-      CHECK_PY(data) << "Calling iterator next error";
-      return nullptr;
-    } else {
-      *atEnd = false;
-      return data;  // just return none in iterator.
-    }
-  } else {
-    *atEnd = false;
-    return data;
-  }
-}
-}  // namespace py
-
-#endif
-
-/**
- * Initialize python.
- */
-void initPython(int argc, char** argv);
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Queue.h b/paddle/legacy/utils/Queue.h
deleted file mode 100644
index 189e1a14f..000000000
--- a/paddle/legacy/utils/Queue.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-
-#include "Locks.h"
-
-namespace paddle {
-
-/**
- * A thread-safe queue that automatically grows but never shrinks.
- * Dequeue a empty queue will block current thread. Enqueue an element
- * will wake up another thread that blocked by dequeue method.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::Queue<int> q;
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job);
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- *
- * @endcode
- */
-template <class T>
-class Queue {
- public:
-  /**
-   * @brief Construct Function. Default capacity of Queue is zero.
-   */
-  Queue() : numElements_(0) {}
-
-  ~Queue() {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element.
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(const T& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(el);
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element. rvalue reference .
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(T&& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(std::move(el));
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ != 0; });
-    T el;
-
-    using std::swap;
-    // Becuase of the previous statement, the right swap() can be found
-    // via argument-dependent lookup (ADL).
-    swap(elements_.front(), el);
-
-    elements_.pop_front();
-    numElements_--;
-    if (numElements_ == 0) {
-      queueCV_.notify_all();
-    }
-    return el;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is not thread safe. Obviously this number
-   * can change by the time you actually look at it.
-   */
-  inline int size() const { return numElements_; }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is not thread safe.
-   */
-  inline bool empty() const { return numElements_ == 0; }
-
-  /**
-   * @brief wait util queue is empty
-   */
-  void waitEmpty() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ == 0; });
-  }
-
-  /**
-   * @brief wait queue is not empty at most for some seconds.
-   * @param seconds wait time limit.
-   * @return true if queue is not empty. false if timeout.
-   */
-  bool waitNotEmptyFor(int seconds) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
-      return numElements_ != 0;
-    });
-  }
-
- private:
-  std::deque<T> elements_;
-  int numElements_;
-  std::mutex queueLock_;
-  std::condition_variable queueCV_;
-};
-
-/*
- * A thread-safe circular queue that
- * automatically blocking calling thread if capacity reached.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::BlockingQueue<int> q(capacity);
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job); //Block until q.size() < capacity .
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- */
-template <typename T>
-class BlockingQueue {
- public:
-  /**
-   * @brief Construct Function.
-   * @param[in] capacity the max numer of elements the queue can have.
-   */
-  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] x The enqueue element, pass by reference .
-   * @note This method is thread-safe, and will wake up another thread
-   * who was blocked because of the queue is empty.
-   * @note If it's size() >= capacity before enqueue,
-   * this method will block and wait until size() < capacity.
-   */
-  void enqueue(const T& x) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notFull_.wait(lock, [&] { return queue_.size() < capacity_; });
-    queue_.push_back(x);
-    notEmpty_.notify_one();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   * @note this method will wake up another thread who was blocked because
-   * of the queue is full.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notEmpty_.wait(lock, [&] { return !queue_.empty(); });
-
-    T front(queue_.front());
-    queue_.pop_front();
-    notFull_.notify_one();
-    return front;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is thread safe.
-   * The size of the queue won't change until the method return.
-   */
-  size_t size() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.size();
-  }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is thread safe.
-   */
-  size_t empty() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.empty();
-  }
-
- private:
-  std::mutex mutex_;
-  std::condition_variable notEmpty_;
-  std::condition_variable notFull_;
-  std::deque<T> queue_;
-  size_t capacity_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Stat.cpp b/paddle/legacy/utils/Stat.cpp
deleted file mode 100644
index ff1b1bf88..000000000
--- a/paddle/legacy/utils/Stat.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Stat.h"
-#include <algorithm>
-#include <iomanip>
-#include "Util.h"
-
-namespace paddle {
-
-StatSet globalStat("GlobalStatInfo");
-
-void Stat::addSample(uint64_t value) {
-  StatInfo* statInfo = statInfo_.get(false);
-  if (!statInfo) {
-    statInfo = new StatInfo(this);
-    statInfo_.set(statInfo);
-    std::lock_guard<std::mutex> guard(lock_);
-    threadLocalBuf_.push_back({statInfo, getTID()});
-  }
-  if (value > statInfo->max_) {
-    statInfo->max_ = value;
-  }
-  if (value < statInfo->min_) {
-    statInfo->min_ = value;
-  }
-  statInfo->total_ += value;
-  statInfo->count_++;
-}
-
-void Stat::mergeThreadStat(StatInfo& allThreadStat) {
-  allThreadStat = destructStat_;
-  for (auto& buf : threadLocalBuf_) {
-    if (buf.first->max_ > allThreadStat.max_) {
-      allThreadStat.max_ = buf.first->max_;
-    }
-    if (buf.first->min_ < allThreadStat.min_) {
-      allThreadStat.min_ = buf.first->min_;
-    }
-    allThreadStat.total_ += buf.first->total_;
-    allThreadStat.count_ += buf.first->count_;
-  }
-}
-
-void Stat::reset() {
-  std::lock_guard<std::mutex> guard(lock_);
-  for (auto& buf : threadLocalBuf_) {
-    buf.first->reset();
-  }
-}
-
-std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
-  std::lock_guard<std::mutex> guard(const_cast<Stat&>(stat).lock_);
-  auto showStat = [&](const StatInfo* info, pid_t tid, bool isFirst = true) {
-    uint64_t average = 0;
-    if (info->count_ > 0) {
-      outPut << std::setfill(' ') << std::left;
-      if (!isFirst) {
-        outPut << std::setw(42) << " ";
-      }
-      average = info->total_ / info->count_;
-      outPut << "Stat=" << std::setw(30) << stat.getName();
-      if (tid) {
-        outPut << " TID=" << std::setw(6) << tid;
-      }
-      outPut << " total=" << std::setw(10) << info->total_ * 0.001
-             << " avg=" << std::setw(10) << average * 0.001
-             << " max=" << std::setw(10) << info->max_ * 0.001
-             << " min=" << std::setw(10) << info->min_ * 0.001
-             << " count=" << std::setw(10) << info->count_ << std::endl;
-    }
-  };
-  if (!stat.getThreadInfo()) {
-    StatInfo infoVarTmp;
-    const_cast<Stat&>(stat).mergeThreadStat(infoVarTmp);
-    showStat(&infoVarTmp, 0);
-  } else {
-    bool isFirst = true;
-    for (auto& buf : stat.threadLocalBuf_) {
-      showStat(buf.first, buf.second, isFirst);
-      if (isFirst) isFirst = false;
-    }
-    showStat(&stat.destructStat_, 0);
-  }
-
-  return outPut;
-}
-
-void StatSet::printSegTimerStatus() {
-  ReadLockGuard guard(lock_);
-  LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-            << "======= StatSet: [" << name_ << "] status ======" << std::endl;
-  for (auto& stat : statSet_) {
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << *(stat.second);
-  }
-}
-
-void StatSet::printAllStatus() {
-#ifndef PADDLE_DISABLE_TIMER
-  printSegTimerStatus();
-#endif
-  LOG(INFO) << std::setiosflags(std::ios::left)
-            << "--------------------------------------------------"
-            << std::endl;
-}
-
-void StatSet::reset(bool clearRawData) {
-  ReadLockGuard guard(lock_);
-  for (auto& stat : statSet_) {
-    stat.second->reset();
-  }
-}
-
-void StatSet::setThreadInfo(const std::string& name, bool flag) {
-  ReadLockGuard guard(lock_);
-  auto iter = statSet_.find(name);
-  CHECK(iter != statSet_.end()) << name << " is not registed in " << name_;
-  iter->second->setThreadInfo(flag);
-}
-
-StatInfo::~StatInfo() {
-  if (stat_) {
-    std::lock_guard<std::mutex> guard(stat_->lock_);
-    if (stat_->destructStat_.max_ < this->max_) {
-      stat_->destructStat_.max_ = this->max_;
-    }
-    if (stat_->destructStat_.min_ > this->min_) {
-      stat_->destructStat_.min_ = this->min_;
-    }
-    stat_->destructStat_.total_ += this->total_;
-    stat_->destructStat_.count_ += this->count_;
-    stat_->threadLocalBuf_.remove({this, getTID()});
-  }
-}
-
-static unsigned g_profileCount = 0;
-static std::recursive_mutex g_profileMutex;
-
-GpuProfiler::GpuProfiler(std::string statName, std::string info)
-    : guard_(g_profileMutex) {
-  if (++g_profileCount == 1) {
-    LOG(INFO) << "Enable GPU Profiler Stat: [" << statName << "] " << info;
-    hl_profiler_start();
-  }
-}
-
-GpuProfiler::~GpuProfiler() {
-  if (--g_profileCount == 0) {
-    hl_profiler_end();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Stat.h b/paddle/legacy/utils/Stat.h
deleted file mode 100644
index 100e9eba9..000000000
--- a/paddle/legacy/utils/Stat.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <sys/time.h>
-#include <iostream>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-
-#include "Locks.h"
-#include "Logging.h"
-#include "ThreadLocal.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-class Stat;
-
-class StatInfo {
- public:
-  explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
-    total_ = 0;
-    max_ = 0;
-    count_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  void reset() {
-    total_ = 0;
-    count_ = 0;
-    max_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  ~StatInfo();
-
-  Stat* stat_;
-  uint64_t total_;
-  uint64_t max_;
-  uint64_t count_;
-  uint64_t min_;
-};
-
-class Stat;
-typedef std::shared_ptr<Stat> StatPtr;
-
-class StatSet {
- public:
-  explicit StatSet(const std::string& name) : name_(name) {}
-  ~StatSet() {}
-
-  // print to LOG(INFO)
-  void printSegTimerStatus();
-  void printAllStatus();
-
-  StatPtr getStat(const std::string& name) {
-    {
-      ReadLockGuard guard(lock_);
-      auto it = statSet_.find(name);
-      if (it != statSet_.end()) {
-        return it->second;
-      }
-    }
-    StatPtr stat = std::make_shared<Stat>(name);
-    std::lock_guard<RWLock> guard(lock_);
-    auto ret = statSet_.insert(std::make_pair(name, stat));
-    return ret.first->second;
-  }
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(const std::string& name, bool flag);
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(bool flag) {
-    for (auto& iter : statSet_) {
-      setThreadInfo(iter.first, flag);
-    }
-  }
-
-  // reset the counters for all stats
-  // clearRawData means also clearing raw tuning data, because at pserver end,
-  // barrier rawData(timeVector_) is stateful, clearing it will cause rubbish
-  // data, while rawData should be cleared at the new pass (so complicated
-  // pserver code logic, -_- ).
-  void reset(bool clearRawData = true);
-
- private:
-  std::unordered_map<std::string, StatPtr> statSet_;
-  const std::string name_;
-  RWLock lock_;
-};
-
-extern StatSet globalStat;
-
-/*@brief : a simple stat*/
-class Stat {
- public:
-  explicit Stat(const std::string& statName)
-      : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
-  ~Stat() {}
-
-  typedef std::list<std::pair<StatInfo*, pid_t>> ThreadLocalBuf;
-
-  const std::string& getName() const { return name_; }
-
-  void addSample(uint64_t value);
-
-  // clear all stats
-  void reset();
-
-  friend std::ostream& operator<<(std::ostream& outPut, const Stat& stat);
-
-  /*  Set operator << whether to print thread info.
-   *  If openThreadInfo_ == true, then print, else print merge thread info.
-   */
-  void setThreadInfo(bool flag) { openThreadInfo_ = flag; }
-
-  bool getThreadInfo() const { return openThreadInfo_; }
-
-  friend class StatInfo;
-
- private:
-  void mergeThreadStat(StatInfo& allThreadStat);
-
-  std::mutex lock_;
-  ThreadLocalBuf threadLocalBuf_;
-  StatInfo destructStat_;
-  ThreadLocal<StatInfo> statInfo_;
-  const std::string name_;
-  bool openThreadInfo_;
-};
-
-extern StatSet globalStat;
-
-inline StatPtr getStat(const std::string& name) {
-  return globalStat.getStat(name);
-}
-
-inline uint64_t nowInMicroSec() {
-  timeval tvTime;
-  (void)gettimeofday(&tvTime, NULL);
-  return tvTime.tv_sec * 1000000LU + tvTime.tv_usec;
-}
-
-/**
- * A simple help class to measure time interval
- */
-class Timer {
- public:
-  explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
-    if (autoStart) {
-      start();
-    }
-  }
-  void start() { startStamp_ = nowInMicroSec(); }
-  void setStartStamp(uint64_t startStamp) { startStamp_ = startStamp; }
-  uint64_t stop() {
-    total_ += nowInMicroSec() - startStamp_;
-    return total_;
-  }
-
-  uint64_t get() const { return total_; }
-
-  void reset() { total_ = 0; }
-
- protected:
-  uint64_t total_;
-  uint64_t startStamp_;
-};
-
-class TimerOnce {
- public:
-  TimerOnce(Stat* stat,
-            const char* info = "",
-            uint64_t threshold = -1,
-            bool autoStart = true,
-            uint64_t startStamp = 0)
-      : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) {
-    if (!autoStart) {
-      timer_.setStartStamp(startStamp);
-    }
-  }
-  ~TimerOnce() {
-    uint64_t span = timer_.stop();
-    if (span >= threshold_) {
-      LOG(INFO) << "Stat: [" << stat_->getName() << "] " << info_
-                << " [Span:" << span / 1000 << "ms" << span % 1000 << "us"
-                << "] ";
-    }
-    stat_->addSample(span);
-  }
-
- private:
-  Stat* stat_;
-  const char* info_;
-  Timer timer_;
-  uint64_t threshold_;
-};
-
-inline uint64_t registerTimerArg1(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return threshold;
-}
-
-inline StatSet& registerTimerArg2(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return statSet;
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define REGISTER_TIMER(statName, ...)
-#define REGISTER_TIMER_SET(statName, start, ...)
-#define REGISTER_TIMER_DYNAMIC(statName, ...)
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)
-#define REGISTER_TIMER_INFO(statName, info)
-#define FOR_TIMING(statement)
-
-#else
-
-#define FOR_TIMING(statement) statement
-
-// The default arguments are shown in the following line:
-// REGISTER_TIMER(statName, threshold = -1, statSet = globalStat)
-// TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed
-#define REGISTER_TIMER(statName, ...)                             \
-  static ::paddle::StatPtr __stat =                               \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_SET(statName, start, ...)                            \
-  static ::paddle::StatPtr __stat =                                         \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-// dynmaic timer, support to discriminate runtime entity, used in pserver
-#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
-  ::paddle::StatPtr __stat =                                      \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
-  ::paddle::StatPtr __stat =                                                \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-#define REGISTER_TIMER_INFO(statName, info)                                 \
-  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                          \
-      __stat.get(), info, 10 * 1000000LU /*threshold*/);
-
-#endif  // DISABLE_TIMER
-
-class GpuProfiler final {
- public:
-  GpuProfiler(std::string statName, std::string info);
-  ~GpuProfiler();
-
- private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-#ifdef PADDLE_DISABLE_PROFILER
-
-#define REGISTER_GPU_PROFILER(statName, ...)
-
-#else
-
-#define REGISTER_GPU_PROFILER(statName, ...) \
-  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
-
-#endif  // DISABLE_PROFILER
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/StringUtil.cpp b/paddle/legacy/utils/StringUtil.cpp
deleted file mode 100644
index 0c98e6db3..000000000
--- a/paddle/legacy/utils/StringUtil.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "StringUtil.h"
-
-namespace paddle {
-namespace str {
-
-bool endsWith(const std::string& str, const std::string& ext) {
-  if (str.size() >= ext.size() && ext == str.substr(str.size() - ext.size())) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void split(const std::string& str, char sep, std::vector<std::string>* pieces) {
-  pieces->clear();
-  if (str.empty()) {
-    return;
-  }
-  size_t pos = 0;
-  size_t next = str.find(sep, pos);
-  while (next != std::string::npos) {
-    pieces->push_back(str.substr(pos, next - pos));
-    pos = next + 1;
-    next = str.find(sep, pos);
-  }
-  if (!str.substr(pos).empty()) {
-    pieces->push_back(str.substr(pos));
-  }
-}
-
-bool startsWith(const std::string& str, const std::string& prefix) {
-  if (prefix.size() <= str.size()) {
-    for (size_t i = 0; i < prefix.size(); ++i) {
-      if (str[i] != prefix[i]) return false;
-    }
-    return true;
-  } else {
-    return false;
-  }
-}
-
-}  // namespace str
-}  // namespace paddle
diff --git a/paddle/legacy/utils/StringUtil.h b/paddle/legacy/utils/StringUtil.h
deleted file mode 100644
index 95f071cb7..000000000
--- a/paddle/legacy/utils/StringUtil.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <sstream>
-#include <string>
-#include <vector>
-#include "Logging.h"
-
-namespace paddle {
-
-namespace str {
-/// test whether a string ends with another string
-bool endsWith(const std::string& str, const std::string& ext);
-
-bool startsWith(const std::string& str, const std::string& prefix);
-
-/**
- * Use sep to split str into pieces.
- * If str is empty, *pieces will be empty.
- * If str ends with sep, the last piece will be an empty string.
- */
-void split(const std::string& str, char sep, std::vector<std::string>* pieces);
-
-/**
- * Cast string to type T with status.
- *
- * @param [in] s input string.
- * @param [out] ok status, return true if there is no error in casting. Set
- *              nullptr if user don't care error at all.
- * @return result of casting. If error occurred, a default value of T() will
- *         return.
- */
-template <class T>
-inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
-  std::istringstream sin(s);
-  T v;
-  sin >> v;
-  if (ok) {
-    *ok = sin.eof() && !sin.fail();
-  }
-  return v;
-}
-
-/**
- * Cast type T to string with status.
- *
- * @param [in] v input value of type T.
- * @param [out] ok status, return true if there is no error in casting. Set
- *              nullptr if user don't care error at all.
- * @return result of casting. If error occurred, a empty string will be
- *              returned.
- */
-template <class T>
-inline std::string toWithStatus(const T v, bool* ok = nullptr) {
-  std::ostringstream sout;
-  sout << v;
-  if (ok) {
-    *ok = !sout.fail();
-  }
-  return sout.str();
-}
-
-/// Convert string to type T. It makes sure all the characters in s are used.
-/// Otherwise it will abort.
-///
-/// @tparam T type of return
-/// @param s string input.
-template <class T>
-inline T to(const std::string& s) {
-  bool ok;
-  T v = toWithStatus<T>(s, &ok);
-  CHECK(ok) << "Cannot convert s(" << s << ") to type " << typeid(T).name();
-  return v;
-}
-
-/// Convert type T to string.
-///
-/// @tparam T type of input value
-/// @param v input value of type T
-template <class T>
-std::string to_string(T v) {
-  bool ok;
-  std::string s = toWithStatus<T>(v, &ok);
-  CHECK(ok) << "Cannot convert v(" << v << ") to type std::string";
-  return s;
-}
-
-}  // namespace str
-
-#undef DEFINE_STRING_CONVERSION
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Thread.h b/paddle/legacy/utils/Thread.h
deleted file mode 100644
index 2ee6eba1a..000000000
--- a/paddle/legacy/utils/Thread.h
+++ /dev/null
@@ -1,615 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <thread>
-#include "Logging.h"
-#include "Util.h"
-
-#include "Queue.h"
-#include "ThreadLocal.h"
-
-#include <future>
-
-namespace paddle {
-
-/**
- * A simple wrapper for std::thread
- */
-
-class Thread {
- public:
-  /**
-   * @brief Construct Function. Default thread pointer is null.
-   */
-  Thread() { thread_ = nullptr; }
-
-  virtual ~Thread() {}
-
-  /**
-   * @brief Creat a new thread and call *run()* function.
-   */
-  void start() {
-    thread_.reset(new std::thread([this]() { this->run(); }));
-  }
-
-  /**
-   * @brief Detach the thread.
-   * It don't need to be waited until it finish.
-   */
-  void detach() { thread_->detach(); }
-
-  /**
-   * @brief Join the thread.
-   * It should be waited until it finish.
-   */
-  void join() { thread_->join(); }
-
-  /**
-   * @brief Define what to be done on this thread through override this
-   * function.
-   */
-  virtual void run() = 0;
-
- protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-/**
- * ThreadWorker maintains a job queue. It executes the jobs in the job queue
- * sequentianlly in a separate thread.
- *
- * Use addJob() to add a new job to the job queue.
- */
-class ThreadWorker : protected Thread {
- public:
-  typedef std::function<void()> JobFunc;
-
-  /**
-   * @brief Construct Function. Default size of job queue is 0 and not stopping.
-   */
-  ThreadWorker() : stopping_(false), empty_(true) { start(); }
-
-  /**
-   * @brief Destruct Function.
-   * If it's running, wait until all job finish and then stop it.
-   */
-  ~ThreadWorker() {
-    if (!stopping_) {
-      wait();
-      stop();
-    }
-  }
-
-  /**
-   * @brief Finish current running job and quit the thread.
-   */
-  void stop() {
-    stopping_ = true;
-    jobs_.enqueue([]() {});
-    join();
-  }
-
-  /**
-   * @brief Add a new job to the job queue.
-   */
-  void addJob(JobFunc func) {
-    empty_ = false;
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Wait until all jobs was done (the job queue was empty).
-   */
-  void wait() {
-    finishCV_.wait([this] { return empty_; });
-  }
-
- protected:
-  /**
-   * @brief Execute jobs in the job queue sequentianlly,
-   * @note If finish all the jobs in the job queue,
-   * notifies all the waiting threads the job queue was empty.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      if (stopping_) break;
-      func();
-      if (jobs_.empty()) {
-        finishCV_.notify_all([this] { empty_ = true; });
-      }
-    }
-  }
-
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-  LockedCondition finishCV_;
-  bool empty_;
-};
-
-/**
- * SyncThreadPool maintains a pool of threads.
- * It executes the job use all workers in the pool.
- *
- * Use exec() to run a new job, job complete when exec returned.
- * Only one job can exec simultaneously.
- *
- * Each worker has an tid whose range is [0, getNumThreads()).
- * JobFunc can use tid to divide input data.
- */
-class SyncThreadPool {
- public:
-  typedef std::function<void(int tid, size_t numThreads)> JobFunc;
-
-  /**
-   * @brief Construct Function. No thread will be created.
-   */
-  SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Construct Fucntion. Create numWorkers of threads in the pool.
-   * @param[in] numWorkers Number of the workers in the pool.
-   * @param[in] checkOwner Default true. If checkOwner is true,
-   * this sync thread pool should be used by it's owner thread.
-   */
-  explicit SyncThreadPool(size_t numWorkers, bool checkOwner = true)
-      : stopping_(false),
-        jobStartBarrier_(numWorkers + 1),
-        jobFinishBarrier_(numWorkers + 1),
-        jobFunc_(nullptr),
-        checkOwner_(checkOwner) {
-    ownerThreadId_ = getTID();
-    workers_.resize(numWorkers);
-    start();
-  }
-
-  ~SyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Return num of threads in the pool.
-   */
-  size_t getNumThreads() { return workers_.size(); }
-
-  /**
-   * @brief Execute a job using all the theads in the pool.
-   * @param[in] jobFunc The function to be executed.
-   * @param[in] ownerFunc Owner thread can do something in owerFunc when job
-   * executing.
-   * @note For the ownerFunc, tid=getNumThreads().
-   */
-  void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
-    if (checkOwner_) {
-      CHECK_EQ(ownerThreadId_, getTID())
-          << "this sync thread pool should be used in one thread";
-    }
-
-    CHECK(jobFunc_ == nullptr);
-    jobFunc_ = jobFunc;
-    jobStartBarrier_.wait();  // notify worker thread start job
-
-    if (ownerFunc) {
-      ownerFunc(workers_.size(), workers_.size());
-    }
-
-    jobFinishBarrier_.wait();  // wait all worker thread complete
-    jobFunc_ = nullptr;
-  }
-
-  /**
-   * @brief Execute a job using all the threads in the pool.
-   * And the owner thread will do the same job.
-   * @param jobFunc The job to be executed.
-   * @note  Assume that JobFunc will execute numThread + 1 times,
-   * with tid ranging [0,numThread]. The thread whose tid is numThread
-   * is the owner thread.
-   */
-  void execPlusOwner(JobFunc jobFunc) { exec(jobFunc, jobFunc); }
-
-  /**
-   * @brief Execute a job if has pool, else use caller thread as a worker.
-   * @param[in] pool The pool to execute the job.
-   * @param[in] jobFunc The job to be excuted.
-   */
-  static void execHelper(SyncThreadPool* pool, JobFunc jobFunc) {
-    if (pool) {
-      pool->exec(jobFunc);
-    } else {
-      jobFunc(0, 1);
-    }
-  }
-
- protected:
-  /**
-   * @brief Start all the workers in the pool, call their run() function.
-   */
-  void start() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      workers_[i].reset(
-          new std::thread([this](int tid) { this->run(tid); }, i));
-    }
-  }
-
-  /**
-   * @brief Stop all the workers in the pool.
-   */
-  void stop() {
-    stopping_ = true;
-    // notify worker thread to stop
-    jobStartBarrier_.wait();
-
-    // stop workers
-    for (auto& thread : workers_) {
-      if (thread) {
-        thread->join();
-        thread.reset(nullptr);
-      }
-    }
-  }
-
-  /**
-   * @brief Execute the jobFunc_ using the worker thread tid, if not stopping.
-   */
-  void run(int tid) {
-    VLOG(1) << "SyncThreadPool worker thread " << tid;
-    // init seed deterministic, but differs from global srand()
-    ThreadLocalRand::initThreadSeed(tid + workers_.size());
-
-    while (true) {
-      jobStartBarrier_.wait();  // wait job
-
-      if (stopping_) {
-        break;
-      }
-
-      jobFunc_(tid, workers_.size());
-
-      jobFinishBarrier_.wait();  // notify job complete
-    }
-  }
-
- protected:
-  pid_t ownerThreadId_;
-  bool stopping_;
-  ThreadBarrier jobStartBarrier_;
-  ThreadBarrier jobFinishBarrier_;
-
-  JobFunc jobFunc_;
-  bool checkOwner_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * MultiThreadWorker maintains a job queue and a result queue.
- * It executes the jobs in the job queue and puts the results into the
- * result queue sequentially in multi separate threads.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue
- *        (the user added jobs should not return nullptr).
- *
- *    Use stopAddJob() to stop adding new jobs to the job queue
- *        (addJob() can not be called after stopAddJob()).
- *
- * Normal stop:
- *
- *    Use waitResult() to get the results until nullptr is returned.
- *    Use stop() to exit normally
- *        (stopAddJob() should be called first).
- *
- * Force stop:
- *
- *    Use forceStop() to exit forcibly even though there are remaining jobs in
- * the
- * job queue.
- */
-template <class T>
-class MultiThreadWorker {
- public:
-  typedef T ResultType;
-  typedef std::shared_ptr<ResultType> ResultPtrType;
-  typedef std::function<ResultPtrType()> JobFunc;
-  /**
-   * @brief Construct Function. Initialize the multithread worker.
-   * @param[in] workerNum Number of the workers.
-   * @param[in] queueCapacity Capapcity of the result queue.
-   */
-  MultiThreadWorker(size_t workerNum, size_t queueCapacity)
-      : stopping_(false),
-        jobAdding_(true),
-        nullResultNum_(0),
-        results_(queueCapacity) {
-    workers_.resize(workerNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  /**
-   * @brief Destruct Function. Force stop the workers
-   * even though there are remaining jobs in the job queue.
-   */
-  virtual ~MultiThreadWorker() { forceStop(); }
-
-  /**
-   * @brief Stop all the workers normally.
-   * @note stopAddJob() should be called before it.
-   */
-  void stop() {
-    CHECK(!jobAdding_) << "stopAddJob() should be called before stop()";
-    for (auto& worker : workers_) {
-      if (worker) {
-        worker->join();
-        worker = nullptr;
-      }
-    }
-    stopping_ = true;
-  }
-
-  /**
-   * @brief Stop all the workers forcibly.
-   * @note This function will call stopAddJob() first
-   * and empty the result queue.
-   */
-  void forceStop() {
-    if (!stopping_) {
-      stopping_ = true;
-      stopAddJob();
-      while (nullptr != waitResult()) {
-      }
-      stop();
-    }
-  }
-
-  /**
-   * @brief Add a job to the job queue.
-   * @note Job can not be added after calling stopAddJob().
-   */
-  void addJob(JobFunc func) {
-    CHECK(jobAdding_) << "addJob() can not be called after stopAddJob()";
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Stop adding new jobs to the job queue.
-   * @note This fuction enqueue a return nullptr function to the job queue.
-   */
-  void stopAddJob() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      jobs_.enqueue([]() { return nullptr; });
-    }
-    jobAdding_ = false;
-  }
-
-  /**
-   * @brief Dequeue the first result in the result queue and return it.
-   * @note If the result queue is empty, wait until it's not empty
-   * or return nullptr if all the results have been returned.
-   */
-  ResultPtrType waitResult() {
-    while (true) {
-      ResultPtrType result = results_.dequeue();
-      if (result) {
-        return result;
-      }
-
-      ++nullResultNum_;
-      if (nullResultNum_ == workers_.size()) {
-        return nullptr;
-      }
-    }
-  }
-
-  /**
-   * @brief The result queue is empty or not.
-   * @return true if empty.
-   */
-  bool testResult() { return results_.empty(); }
-
- protected:
-  /**
-   * @brief Do the jobs in the job queue sequentianlly
-   * and enqueue the result into the result queue.
-   * @note A nullptr will be enqueued into the resulte queue, when a worker
-   * finished.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      ResultPtrType result = func();
-      if (result == nullptr || stopping_) {
-        // When a worker finished, a nullptr would be enqueued into results_
-        results_.enqueue(nullptr);
-        break;
-      }
-      results_.enqueue(result);
-    }
-  }
-
-  bool stopping_;
-  bool jobAdding_;
-  size_t nullResultNum_;
-  Queue<JobFunc> jobs_;
-  BlockingQueue<ResultPtrType> results_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * AsyncThreadPool maintains a job queue and threads pool.
- * It executes the jobs from queue asynchronously.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue and get a std::future
- *    result. The caller's thread continues running. Call std::future::get()
- *    when the result's value is needed, and the caller's thread may be
- *    blocked until thread-pool finished the job.
- *
- *    Use addBatchJobs() to add a batch of jobs.
- *    Unlike addJob()'s asynchronization, addBatchJobs will block caller's
- *    thread until all jobs in the batch are finished.
- *
- * Stop:
- *    Use stop() to stop the thread pool. Job can be added once stopped.
- *
- * Process-wide Singleton:
- *    Use AsyncThreadPool::ProcessChannel(N) first to create N threads.
- *    Then call AsyncThreadPool::ProcessChannel() to get the process-wide global
- *    thread pool.
- */
-class AsyncThreadPool {
- public:
-  typedef std::function<void()> JobFunc;
-
-  AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Construct Function. Install all the workers.
-   * @param[in] threadNum Number of the threads, must greater than 1.
-   */
-  explicit AsyncThreadPool(size_t threadNum) {
-    CHECK_GT(threadNum, 1U);
-    stopping_ = false;
-    workers_.resize(threadNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  ~AsyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Stop all the workers normally.
-   */
-  void stop() {
-    stopping_ = true;
-    for (size_t i = 0; i < workers_.size(); i++) {
-      jobs_.enqueue([] {});
-    }
-    for (auto& worker : workers_) {
-      worker->join();
-    }
-  }
-
-  /**
-   * @brief A process-wide singleton. Used as a global thread pool
-   *    It should be initialized by calling
-   *    AsyncThreadPool::ProcessChannel(N) first to create N threads,
-   *    then call AsyncThreadPool::ProcessChannel() will get the thread pool.
-   */
-  static AsyncThreadPool& ProcessChannel(size_t initThreadNum = 0) {
-    static std::shared_ptr<AsyncThreadPool> channel(
-        new AsyncThreadPool(initThreadNum));
-    return *channel;
-  }
-
-  /**
-   * @brief Add a job to queue and return a std::future.
-   * @note The job will be executed
-   * asynchronously.
-   * Call std::future::get() when the execturation result is needed;
-   */
-  template <class F, class... Args>
-  auto addJob(F&& f, Args&&... args)
-      -> std::future<typename std::result_of<F(Args...)>::type> {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    typedef typename std::result_of<F(Args...)>::type T;
-
-    auto task = std::make_shared<std::packaged_task<T()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-    auto res = task->get_future();
-    jobs_.enqueue([task] { (*task)(); });
-    return res;
-  }
-
-  /**
-   * @brief Add a batch of jobs to the queue. The main thread will be blocked
-   * until these jobs are finished.
-   * The results will be stored in  `results` according to `jobs` order.
-   *
-   * @tparam F should have a return value.
-   *
-   * @param[in] jobs a vector of executable objection.
-   * @param[in] results a vector to store the results.
-   *
-   * @note *results* may need to be carefully cleared before *addBatchJobs()*.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs,
-                    std::vector<typename std::result_of<F()>::type>& results) {
-    typedef typename std::result_of<F()>::type T;
-    static_assert(!std::is_same<T, void>::value,
-                  "should pass a non-void function as job");
-
-    std::vector<std::future<T>> resFuts;
-    for (const auto& job : jobs) {
-      resFuts.emplace_back(addJob(job));
-    }
-    for (auto& fut : resFuts) {
-      results.emplace_back(fut.get());
-    }
-  }
-
-  /**
-   * @brief Add a batch of jobs reguardless of its result.
-   * @tparam F don't need to have a return value.
-   * @param[in] jobs a vector of executable objection.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs) {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    std::vector<std::future<bool>> tmpRes;
-
-    for (const auto& job : jobs) {
-      tmpRes.emplace_back(addJob([&job] {
-        job();
-        return true;
-      }));
-    }
-
-    for (auto& res : tmpRes) {
-      res.get();
-    }
-  }
-
- protected:
-  /**
-   * @brief Execute the jobs in the job queue.
-   */
-  void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      func();
-      if (stopping_) break;
-    }
-  }
-
- private:
-  std::vector<std::unique_ptr<std::thread>> workers_;
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-};  // class AsyncThreadPool
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/ThreadLocal.cpp b/paddle/legacy/utils/ThreadLocal.cpp
deleted file mode 100644
index 58fe51bd4..000000000
--- a/paddle/legacy/utils/ThreadLocal.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadLocal.h"
-
-#include <gflags/gflags.h>
-
-#include "Util.h"
-
-DEFINE_bool(thread_local_rand_use_global_seed,
-            false,
-            "Whether to use global seed in thread local rand.");
-
-namespace paddle {
-
-unsigned int ThreadLocalRand::defaultSeed_ = 1;
-ThreadLocal<unsigned int> ThreadLocalRand::seed_;
-
-unsigned int* ThreadLocalRand::getSeed() {
-  unsigned int* p = seed_.get(false /*createLocal*/);
-  if (!p) {  // init seed
-    if (FLAGS_thread_local_rand_use_global_seed) {
-      p = new unsigned int(defaultSeed_);
-    } else if (getpid() == getTID()) {  // main thread
-      // deterministic, but differs from global srand()
-      p = new unsigned int(defaultSeed_ - 1);
-    } else {
-      p = new unsigned int(defaultSeed_ + getTID());
-      VLOG(3) << "thread use undeterministic rand seed:" << *p;
-    }
-    seed_.set(p);
-  }
-  return p;
-}
-
-ThreadLocal<std::default_random_engine> ThreadLocalRandomEngine::engine_;
-std::default_random_engine& ThreadLocalRandomEngine::get() {
-  auto engine = engine_.get(false);
-  if (!engine) {
-    engine = new std::default_random_engine;
-    int defaultSeed = ThreadLocalRand::getDefaultSeed();
-    engine->seed(FLAGS_thread_local_rand_use_global_seed
-                     ? defaultSeed
-                     : defaultSeed + getTID());
-    engine_.set(engine);
-  }
-  return *engine;
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h
deleted file mode 100644
index 6268b73a8..000000000
--- a/paddle/legacy/utils/ThreadLocal.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef _WIN32
-#include <pthread.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#include <sys/types.h>
-#include <map>
-#include <mutex>
-#include <random>
-#include "Logging.h"
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * Thread local storage for object.
- * Example:
- *
- * Declarartion:
- * ThreadLocal<vector<int>> vec_;
- *
- * Use in thread:
- * vector<int>& vec = *vec; // obtain the thread specific object
- * vec.resize(100);
- *
- * Note that this ThreadLocal will desconstruct all internal data when thread
- * exits
- * This class is suitable for cases when frequently creating and deleting
- * threads.
- *
- * Consider implementing a new ThreadLocal if one needs to frequently create
- * both instances and threads.
- *
- * see also ThreadLocalD
- */
-template <class T>
-class ThreadLocal {
- public:
-  ThreadLocal() {
-    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
-  }
-  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
-
-  /**
-   * @brief get thread local object.
-   * @param if createLocal is true and thread local object is never created,
-   * return a new object. Otherwise, return nullptr.
-   */
-  T* get(bool createLocal = true) {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p && createLocal) {
-      p = new T();
-      int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK_EQ(ret, 0);
-    }
-    return p;
-  }
-
-  /**
-   * @brief set (overwrite) thread local object. If there is a thread local
-   * object before, the previous object will be destructed before.
-   *
-   */
-  void set(T* p) {
-    if (T* q = get(false)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-  }
-
-  /**
-   * return reference.
-   */
-  T& operator*() { return *get(); }
-
-  /**
-   * Implicit conversion to T*
-   */
-  operator T*() { return get(); }
-
- private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  pthread_key_t threadSpecificKey_;
-};
-
-/**
- * Almost the same as ThreadLocal, but note that this ThreadLocalD will
- * destruct all internal data when ThreadLocalD instance destructs.
- *
- * This class is suitable for cases when frequently creating and deleting
- * objects.
- *
- * see also ThreadLocal
- *
- * @note The type T must implemented default constructor.
- */
-template <class T>
-class ThreadLocalD {
- public:
-  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
-  ~ThreadLocalD() {
-    pthread_key_delete(threadSpecificKey_);
-    for (auto t : threadMap_) {
-      dataDestructor(t.second);
-    }
-  }
-
-  /**
-   * @brief Get thread local object. If not exists, create new one.
-   */
-  T* get() {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p) {
-      p = new T();
-      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-      updateMap(p);
-    }
-    return p;
-  }
-
-  /**
-   * @brief Set thread local object. If there is an object create before, the
-   * old object will be destructed.
-   */
-  void set(T* p) {
-    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-    updateMap(p);
-  }
-
-  /**
-   * @brief Get reference of the thread local object.
-   */
-  T& operator*() { return *get(); }
-
- private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  void updateMap(T* p) {
-    pid_t tid = getTID();
-    CHECK_NE(tid, -1);
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto ret = threadMap_.insert(std::make_pair(tid, p));
-    if (!ret.second) {
-      ret.first->second = p;
-    }
-  }
-
-  pthread_key_t threadSpecificKey_;
-  std::mutex mutex_;
-  std::map<pid_t, T*> threadMap_;
-};
-
-/**
- * @brief Thread-safe C-style random API.
- */
-class ThreadLocalRand {
- public:
-  /**
-   * initSeed just like srand,
-   * called by main thread,
-   * init defaultSeed for all thread
-   */
-  static void initSeed(unsigned int seed) { defaultSeed_ = seed; }
-
-  /**
-   * initThreadSeed called by each thread,
-   * init seed to defaultSeed + *tid*
-   * It should be called after main initSeed and before using rand()
-   * It's optional, getSeed will init seed if it's not initialized.
-   */
-  static void initThreadSeed(int tid) {
-    seed_.set(new unsigned int(defaultSeed_ + tid));
-  }
-
-  /// thread get seed, then can call rand_r many times.
-  /// Caller thread can modify the seed value if it's necessary.
-  ///
-  /// if flag thread_local_rand_use_global_seed set,
-  /// the seed will be set to defaultSeed in thread's first call.
-  static unsigned int* getSeed();
-
-  /// like ::rand
-  static int rand() { return rand_r(getSeed()); }
-
-  /**
-   * Get defaultSeed for all thread.
-   */
-  static int getDefaultSeed() { return defaultSeed_; }
-
- protected:
-  static unsigned int defaultSeed_;
-  static ThreadLocal<unsigned int> seed_;
-};
-
-/**
- * @brief Thread-safe C++ style random engine.
- */
-class ThreadLocalRandomEngine {
- public:
-  /**
-   * get random_engine for each thread.
-   *
-   * Engine's seed will be initialized by ThreadLocalRand.
-   */
-  static std::default_random_engine& get();
-
- protected:
-  static ThreadLocal<std::default_random_engine> engine_;
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Util.cpp b/paddle/legacy/utils/Util.cpp
deleted file mode 100644
index 2755fdd9c..000000000
--- a/paddle/legacy/utils/Util.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Util.h"
-
-#include <dirent.h>
-#include <signal.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
-#ifdef __SSE3__
-#include <pmmintrin.h>
-#endif
-
-#include <fstream>
-#include <mutex>
-
-#include <gflags/gflags.h>
-
-#include "CpuId.h"
-#include "CustomStackTrace.h"
-#include "Logging.h"
-#include "StringUtil.h"
-#include "Thread.h"
-#include "ThreadLocal.h"
-#include "Version.h"
-
-DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
-
-#ifdef WITH_GOOGLE_PERFTOOLS
-/*
-  In order to use google profiler, you need to install gperftools,
-  which can be obtained at:
-  https://gperftools.googlecode.com/files/gperftools-2.0.tar.gz
-
-  gperftools should be configured with --enable-frame-pointers
-
-  Then link the executable with -lprofiler.
-
-  After you start the application, you can use kill -s signal PID to
-  start/stop profiling. The profile data will be stored in file
-  FLAGS_profile_data_file, which can be analyzed by pprof.
-*/
-
-#include <gperftools/profiler.h>
-
-DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
-
-static void profilerSwitch(int signalNumber) {
-  bool static started = false;
-
-  if (!started) {
-    if (ProfilerStart(FLAGS_profile_data_file.c_str())) {
-      LOG(INFO) << "Profiler started";
-    } else {
-      LOG(WARNING) << "Can't turn on cpu profiling for "
-                   << FLAGS_profile_data_file;
-    }
-  } else {
-    ProfilerStop();
-    LOG(INFO) << "Profiler stopped";
-  }
-  started = !started;
-}
-
-static void installProfilerSwitch() {
-  sighandler_t oldHandler = signal(FLAGS_profile_signal, profilerSwitch);
-
-  if (!oldHandler) {
-    LOG(INFO) << "Using signal " << FLAGS_profile_signal
-              << " to turn on/off profiler";
-  } else {
-    LOG(WARNING) << "Signal " << FLAGS_profile_signal << " is already in use\n";
-  }
-}
-
-#else
-
-static void installProfilerSwitch() {}
-
-#endif  // WITH_GOOGLE_PERFTOOLS
-
-namespace paddle {
-
-pid_t getTID() {
-#if defined(__APPLE__) || defined(__OSX__)
-  // syscall is deprecated: first deprecated in macOS 10.12.
-  // syscall is unsupported;
-  // syscall pid_t tid = syscall(SYS_thread_selfid);
-  uint64_t tid;
-  pthread_threadid_np(NULL, &tid);
-#else
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-#endif
-  CHECK_NE((int)tid, -1);
-  return tid;
-}
-
-static bool g_initialized = false;
-typedef std::pair<int, std::function<void()>> PriorityFuncPair;
-typedef std::vector<PriorityFuncPair> InitFuncList;
-static InitFuncList* g_initFuncs = nullptr;
-static std::once_flag g_onceFlag;
-void registerInitFunction(std::function<void()> func, int priority) {
-  if (g_initialized) {
-    LOG(FATAL) << "registerInitFunction() should only called before initMain()";
-  }
-  if (!g_initFuncs) {
-    g_initFuncs = new InitFuncList();
-  }
-  g_initFuncs->push_back(std::make_pair(priority, func));
-}
-
-void runInitFunctions() {
-  std::call_once(g_onceFlag, []() {
-    VLOG(3) << "Calling runInitFunctions";
-    if (g_initFuncs) {
-      std::sort(g_initFuncs->begin(),
-                g_initFuncs->end(),
-                [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
-                  return x.first > y.first;
-                });
-      for (auto& f : *g_initFuncs) {
-        f.second();
-      }
-      delete g_initFuncs;
-      g_initFuncs = nullptr;
-    }
-    g_initialized = true;
-    VLOG(3) << "Call runInitFunctions done.";
-  });
-}
-
-void initMain(int argc, char** argv) {
-  installLayerStackTracer();
-  std::string line;
-  for (int i = 0; i < argc; ++i) {
-    line += argv[i];
-    line += ' ';
-  }
-
-#ifndef GFLAGS_GFLAGS_H_
-  namespace gflags = google;
-#endif
-
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  initializeLogging(argc, argv);
-  LOG(INFO) << "commandline: " << line;
-  CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
-
-  installProfilerSwitch();
-
-#ifdef __SSE__
-  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-#endif
-#ifdef __SSE3__
-  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#endif
-
-  if (FLAGS_seed == 0) {
-    unsigned int t = time(NULL);
-    srand(t);
-    ThreadLocalRand::initSeed(t);
-    LOG(INFO) << "random number seed=" << t;
-  } else {
-    srand(FLAGS_seed);
-    ThreadLocalRand::initSeed(FLAGS_seed);
-  }
-
-  if (FLAGS_use_gpu) {
-    // This is the initialization of the CUDA environment,
-    // need before runInitFunctions.
-    // TODO(hedaoyuan) Can be considered in the runInitFunctions,
-    // but to ensure that it is the first to initialize.
-    hl_start();
-    hl_init(FLAGS_gpu_id);
-  }
-
-  version::printVersion();
-  checkCPUFeature().check();
-  runInitFunctions();
-}
-
-std::string readFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-
-  // get length of file:
-  is.seekg(0, is.end);
-  size_t length = is.tellg();
-  is.seekg(0, is.beg);
-  std::string str(length, (char)0);
-  CHECK(is.read(&str[0], length)) << "Fail to read file: " << fileName;
-  return str;
-}
-
-namespace path {
-
-std::string basename(const std::string& path) {
-  size_t pos = path.rfind(sep);
-  ++pos;
-  return path.substr(pos, std::string::npos);
-}
-
-std::string dirname(const std::string& path) {
-  size_t pos = path.rfind(sep);
-  if (pos == std::string::npos) return std::string();
-  return path.substr(0, pos);
-}
-
-std::string join(const std::string& part1, const std::string& part2) {
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-}  // namespace path
-
-void copyFileToPath(const std::string& file, const std::string& dir) {
-  VLOG(3) << "copy " << file << " to " << dir;
-  std::string fileName = path::basename(file);
-  std::string dst = path::join(dir, fileName);
-  std::ifstream source(file, std::ios_base::binary);
-  std::ofstream dest(dst, std::ios_base::binary);
-  CHECK(source) << "Fail to open " << file;
-  CHECK(dest) << "Fail to open " << dst;
-  dest << source.rdbuf();
-  source.close();
-  dest.close();
-}
-
-bool fileExist(const char* filename) { return (access(filename, 0) == 0); }
-
-void touchFile(const char* filename) {
-  if (!fileExist(filename)) {
-    std::ofstream os(filename);
-  }
-}
-
-int isDir(const char* path) {
-  struct stat s_buf;
-  if (stat(path, &s_buf)) {
-    return 0;
-  }
-  return S_ISDIR(s_buf.st_mode);
-}
-
-void rmDir(const char* folderName) {
-  if (isDir(folderName)) {
-    DIR* dp;
-    struct dirent* ep;
-    std::string buf;
-    dp = opendir(folderName);
-    while ((ep = readdir(dp)) != NULL) {
-      if (strcmp(ep->d_name, ".") && strcmp(ep->d_name, "..")) {
-        buf = std::string(folderName) + "/" + std::string(ep->d_name);
-        if (isDir(buf.c_str())) {
-          rmDir(buf.c_str());
-        } else {
-          remove(buf.c_str());
-        }
-      }
-    }
-    closedir(dp);
-    rmdir(folderName);
-  }
-}
-
-void mkDir(const char* filename) {
-  if (mkdir(filename, 0755)) {
-    CHECK(errno == EEXIST) << filename << "mkdir failed!";
-  }
-}
-
-void mkDirRecursively(const char* dir) {
-  struct stat sb;
-
-  if (*dir == 0) return;  // empty string
-  if (!stat(dir, &sb)) return;
-
-  mkDirRecursively(path::dirname(dir).c_str());
-
-  mkDir(dir);
-}
-
-void loadFileList(const std::string& fileListFileName,
-                  std::vector<std::string>& fileList) {
-  std::ifstream is(fileListFileName);
-  CHECK(is) << "Fail to open " << fileListFileName;
-  std::string line;
-  while (is) {
-    if (!getline(is, line)) break;
-    fileList.push_back(line);
-  }
-}
-
-double getMemoryUsage() {
-#if defined(__ANDROID__)
-  return 0.0;
-#else
-  FILE* fp = fopen("/proc/meminfo", "r");
-  CHECK(fp) << "failed to fopen /proc/meminfo";
-  size_t bufsize = 256 * sizeof(char);
-  char* buf = new (std::nothrow) char[bufsize];
-  CHECK(buf);
-  int totalMem = -1;
-  int freeMem = -1;
-  int bufMem = -1;
-  int cacheMem = -1;
-  while (getline(&buf, &bufsize, fp) >= 0) {
-    if (0 == strncmp(buf, "MemTotal", 8)) {
-      if (1 != sscanf(buf, "%*s%d", &totalMem)) {
-        LOG(FATAL) << "failed to get MemTotal from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "MemFree", 7)) {
-      if (1 != sscanf(buf, "%*s%d", &freeMem)) {
-        LOG(FATAL) << "failed to get MemFree from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "Buffers", 7)) {
-      if (1 != sscanf(buf, "%*s%d", &bufMem)) {
-        LOG(FATAL) << "failed to get Buffers from string: [" << buf << "]";
-      }
-    } else if (0 == strncmp(buf, "Cached", 6)) {
-      if (1 != sscanf(buf, "%*s%d", &cacheMem)) {
-        LOG(FATAL) << "failed to get Cached from string: [" << buf << "]";
-      }
-    }
-    if (totalMem != -1 && freeMem != -1 && bufMem != -1 && cacheMem != -1) {
-      break;
-    }
-  }
-  CHECK(totalMem != -1 && freeMem != -1 && bufMem != -1 && cacheMem != -1)
-      << "failed to get all information";
-  fclose(fp);
-  delete[] buf;
-  double usedMem = 1.0 - 1.0 * (freeMem + bufMem + cacheMem) / totalMem;
-  return usedMem;
-#endif
-}
-
-SyncThreadPool* getGlobalSyncThreadPool() {
-  static std::unique_ptr<SyncThreadPool> syncThreadPool;
-  if (syncThreadPool &&
-      syncThreadPool->getNumThreads() != (size_t)FLAGS_trainer_count) {
-    LOG(WARNING) << "trainer_count changed in training process!";
-    syncThreadPool.reset(nullptr);
-  }
-  if (!syncThreadPool) {
-    syncThreadPool.reset(new SyncThreadPool(FLAGS_trainer_count));
-  }
-  return syncThreadPool.get();
-}
-
-size_t calculateServiceNum(const std::string& pservers, int ports_num) {
-  std::vector<std::string> hosts;
-  str::split(pservers, ',', &hosts);
-  return hosts.size() * ports_num;
-}
-
-void memcpyWithCheck(void* dest,
-                     const void* src,
-                     size_t num,
-                     const void* srcEnd) {
-  int minus = (char*)srcEnd - (char*)src - num;
-  CHECK_LE(0, minus) << "memcpyWithCheck: copy " << num
-                     << " bytes data out of range.";
-  memcpy(dest, src, num);
-}
-
-hl_activation_mode_t hlActiveType(const std::string& type) {
-  if (type == "sigmoid") {
-    return HL_ACTIVATION_SIGMOID;
-  } else if (type == "relu") {
-    return HL_ACTIVATION_RELU;
-  } else if (type == "tanh") {
-    return HL_ACTIVATION_TANH;
-  } else if (type == "linear" || type == "") {
-    return HL_ACTIVATION_LINEAR;
-  } else {
-    LOG(FATAL) << "Do not support activation type " << type;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h
deleted file mode 100644
index 3a878b2b3..000000000
--- a/paddle/legacy/utils/Util.h
+++ /dev/null
@@ -1,597 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef _WIN32
-#include <sys/syscall.h>  // for syscall()
-#endif
-#include <sys/types.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "Common.h"
-#include "Logging.h"
-#include "TrainerConfig.pb.h"
-
-#include "Flags.h"
-#include "hl_gpu.h"
-
-#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
-inline int rand_r(unsigned int* seedp) {
-  (void)seedp;
-  return rand();
-}
-#endif
-
-#ifdef _WIN32
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#include <windows.h>
-
-template <typename T>
-inline int __builtin_clz(const T& value) {
-  DWORD leadning_zero = 0;
-  if (_BitScanReverse(&leadning_zero, value)) {
-    return static_cast<int>(sizeof(T) * 8 - leadning_zero);
-  } else {
-    return static_cast<int>(0);
-  }
-}
-
-inline int __builtin_clzl(const unsigned long& value) {
-  return __builtin_clz(value);
-}
-
-inline int __builtin_clzll(const unsigned long long& value) {
-  return __builtin_clz(value);
-}
-
-#define pid_t int
-#endif
-
-/**
- * Loop over the elements in a container
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH(iterator_name, container)                              \
-  for (auto iterator_name = (container).begin(), e = (container).end(); \
-       iterator_name != e;                                              \
-       ++iterator_name)
-
-/**
- * Loop over the elements in a container in reverse order
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH_R(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH_R(iterator_name, container)                              \
-  for (auto iterator_name = (container).rbegin(), e = (container).rend(); \
-       iterator_name != e;                                                \
-       ++iterator_name)
-
-namespace paddle {
-
-// return the thread id used by glog
-pid_t getTID();
-
-/**
- * return the 1-based index of the highest bit set
- *
- * for x > 0:
- * \f[
- *    findLastSet(x) = 1 + \floor*{\log_{2}x}
- * \f]
- */
-inline constexpr size_t findLastSet(size_t x) {
-  return std::is_same<size_t, unsigned int>::value
-             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
-             : (std::is_same<size_t, unsigned long>::value  // NOLINT
-                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
-                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-}
-
-/**
- * calculate the non-negative remainder of a/b
- * @param[in] a
- * @param[in] b, should be positive
- * @return the non-negative remainder of a / b
- */
-inline int mod(int a, int b) {
-  int r = a % b;
-  return r >= 0 ? r : r + b;
-}
-
-/**
- * find the value given a key k from container c.
- * If the key can be found, the value is stored in *value
- * return true if the key can be found. false otherwise.
- */
-template <class K, class V, class C>
-bool mapGet(const K& k, const C& c, V* value) {
-  auto it = c.find(k);
-  if (it != c.end()) {
-    *value = it->second;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <class Container, class T>
-static bool contains(const Container& container, const T& val) {
-  return std::find(container.begin(), container.end(), val) != container.end();
-}
-
-/**
- * pop and get the front element of a container
- */
-template <typename Container>
-typename Container::value_type pop_get_front(Container& c) {
-  typename Container::value_type v;
-  swap(v, c.front());
-  c.pop_front();
-  return v;
-}
-
-#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
-
-/**
- * Initialize some creators or initFunctions for layers and data
- * providers.
- * Client codes should call this function before they refer any other
- * codes that use the layer class and data provider class.
- *
- * Codes inside 'core' directory can call initMain which calls
- * runInitFunctions directly, while codes outside core can simply
- * call runInitFunctions if they don't need the commandline flags
- * designed for PADDLE main procedure.
- */
-void runInitFunctions();
-
-/**
- * Initialize logging and parse commandline
- */
-void initMain(int argc, char** argv);
-
-// read the whole file into a string
-std::string readFile(const std::string& fileName);
-
-// copy file to path
-void copyFileToPath(const std::string& file, const std::string& path);
-
-// test file exist or not
-bool fileExist(const char* filename);
-// touch file if not exist
-void touchFile(const char* filename);
-// make dir if not exist
-void mkDir(const char* filename);
-void mkDirRecursively(const char* filename);
-
-void rmDir(const char* folderName);
-
-// load a file list file into a vector(fileList)
-void loadFileList(const std::string& fileListFileName,
-                  std::vector<std::string>& fileList);
-
-/**
- * Register a function, the function will be called in initMain(). Functions
- * with higher priority will be called first. The execution order of functions
- * with same priority is not defined.
- */
-void registerInitFunction(std::function<void()> func, int priority = 0);
-class InitFunction {
- public:
-  explicit InitFunction(std::function<void()> func, int priority = 0) {
-    registerInitFunction(func, priority);
-  }
-};
-
-/**
- * Class SetDevice provides a mechanism for set device enviroment.
- * When a SetDevice object is created, it attempts to change device enviroment.
- * When the SetDevice object is destructed, it will restore device environment.
- */
-class SetDevice {
- public:
-  explicit SetDevice(int deviceId) {
-    isSet_ = deviceId >= 0;
-    devId_ = 0;
-    if (isSet_) {
-      devId_ = hl_get_device();
-      hl_set_device(deviceId);
-    }
-  }
-  ~SetDevice() {
-    if (isSet_) {
-      hl_set_device(devId_);
-    }
-  }
-
- protected:
-  bool isSet_;
-  int devId_;
-};
-
-/**
- * Enables direct access to memory allocations on a peer device(d2).
- * input:
- * *d1* is device can direct access device d2.
- * *d2* is peer device to enable direct access to by the d1 device.
- */
-inline void enablePeerAccess(int d1, int d2) {
-#ifdef PADDLE_WITH_CUDA
-  if (hl_device_can_access_peer(d1, d2)) {
-    SetDevice dev(d1);
-    hl_device_enable_peer_access(d2);
-  }
-#else
-  LOG(FATAL) << "Paddle should be compiled in GPU mode to use this method.";
-#endif
-}
-
-/**
- * Change the gpu computation mode to asynchronized mode for the rest of the
- * compilation block. This is useful if the computation consists of multiple
- * small steps. Async mode can overlap the cuda-kernel launch overhead with the
- * actual computation.
- * Example:
- * {
- *    AsycnGpuBlock asyncBlock;
- *    do_some_gpu_computation
- * }
- */
-class AsyncGpuBlock {
- public:
-  AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
-  ~AsyncGpuBlock() {
-    if (syncFlag_) {
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      hl_set_sync_flag(syncFlag_);
-    }
-  }
-
- private:
-  bool syncFlag_;
-};
-
-inline bool useGpu(int deviceId) {
-  return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu;
-}
-
-/*
- * hppl activation mode
- */
-hl_activation_mode_t hlActiveType(const std::string& type);
-
-/**
- * Return value: memory usage ratio (from 0-1)
- */
-double getMemoryUsage();
-
-/**
- * split array by index.
- * used by sync multi thread task,
- * each thread call calcSplitArrayInterval with thread id,
- * get a interval as return.
- * input:
- * *totalSize* is array size,
- * *tId* is thread id, *tSize* is total worker thread num
- * output:
- * start and end index as a std::pair
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize) {
-  size_t start = totalSize * tId / tSize;
-  size_t end = totalSize * (tId + 1) / tSize;
-  return std::make_pair(start, end);
-}
-
-/**
- * same as above, but split at boundary of block.
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize,
-                                                        size_t blockSize) {
-  size_t numBlocks = totalSize / blockSize;
-  if (numBlocks * blockSize < totalSize) {
-    numBlocks++;
-  }
-
-  auto interval = calcSplitArrayInterval(numBlocks, tId, tSize);
-  size_t start = std::min(interval.first * blockSize, totalSize);
-  size_t end = std::min(interval.second * blockSize, totalSize);
-
-  return std::make_pair(start, end);
-}
-
-// Calculate the number of pservers/dservers based
-// on the host list and port_num.
-size_t calculateServiceNum(const std::string& pservers, int ports_num);
-
-/**
- * sort and unique ids vector.
- */
-inline void uniqueIds(std::vector<uint32_t>& ids) {
-  std::sort(ids.begin(), ids.end());
-  auto endpos = std::unique(ids.begin(), ids.end());
-  ids.erase(endpos, ids.end());
-}
-
-/**
- * Read Type value
- */
-template <typename T>
-T readT(char*& p, const char* pEnd) {
-  int minus = pEnd - p - sizeof(T);
-  CHECK_LE(0, minus) << "readT: Out of range.";
-  T v = *reinterpret_cast<T*>(p);
-  p += sizeof(T);
-  return v;
-}
-
-void memcpyWithCheck(void* dest,
-                     const void* src,
-                     size_t num,
-                     const void* srcEnd);
-
-/**
- * A global sync thread pool, has #FLAGS_trainer_count of threads.
- * can be used in main thread.
- */
-class SyncThreadPool;
-SyncThreadPool* getGlobalSyncThreadPool();
-
-namespace path {
-
-// directory separator
-const char sep = '/';
-
-// Return the base name of pathname path.
-std::string basename(const std::string& path);
-
-// Return the directory name of path. If the path does not contains any
-// directory, it returns an empty string.
-std::string dirname(const std::string& path);
-
-/*
-  Join two path components intelligently.
-  The return value is the concatenation of part1 and part2 with exactly one
-  directory separator (path.sep) following each non-empty part except the last,
-  meaning that the result will only end in a separator if the last part is
-  empty.
-  If a component is an absolute path, all previous components are thrown away
-  and joining continues from the absolute path component.
-*/
-std::string join(const std::string& part1, const std::string& part2);
-
-template <typename... Args>
-std::string join(const std::string& part1,
-                 const std::string& part2,
-                 Args... args) {
-  return join(join(part1, part2), args...);
-}
-
-}  // namespace path
-
-/**
- * A Checker for each invoke of method in same thread.
- */
-class SameThreadChecker {
- public:
-  SameThreadChecker() {}
-
-  /**
-   * Disable copy
-   */
-  SameThreadChecker(const SameThreadChecker& other) = delete;
-  SameThreadChecker& operator=(const SameThreadChecker& other) = delete;
-
-  /**
-   * Each invoke of check method should be in same thread, otherwise, it will
-   * failed and core dump.
-   */
-  void check() {
-    std::thread::id curThreadId = std::this_thread::get_id();
-    std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; });
-    CHECK_EQ(invokeThreadId_, curThreadId)
-        << "This method should invoke in "
-           "same thread, but first invoked in "
-        << invokeThreadId_ << " current invoked in " << curThreadId;
-  }
-
- private:
-  std::once_flag onceFlag_;
-  std::thread::id invokeThreadId_;
-};
-
-/**
- * Key-Value Cache Helper.
- *
- * It store a object instance global. User can invoke get method by key and a
- * object creator callback. If there is a instance stored in cache, then it will
- * return a shared_ptr of it, otherwise, it will invoke creator callback, create
- * a new instance store global, and return it.
- *
- * The cache instance will release when nobody hold a reference to it.
- *
- * The KType is the key type.
- * The VType is the value type.
- * The Hash is the key hasher object.
- */
-template <typename KType, typename VType, typename Hash>
-class WeakKVCache {
- public:
-  WeakKVCache() {}
-
-  std::shared_ptr<VType> get(const KType& key,
-                             const std::function<VType*()>& creator) {
-    std::lock_guard<std::mutex> guard(this->lock_);
-    auto it = this->storage_.find(key);
-    if (it != this->storage_.end()) {
-      auto& val = it->second;
-      auto retVal = val.lock();
-      if (retVal != nullptr) {
-        return retVal;
-      }  // else fall trough. Because it is WeakPtr Cache.
-    }
-    auto rawPtr = creator();
-    CHECK(rawPtr != nullptr);
-    std::shared_ptr<VType> retVal(rawPtr);
-    this->storage_[key] = retVal;
-    return retVal;
-  }
-
- private:
-  std::mutex lock_;
-  std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
-};
-
-/**
- * @brief The ScopedCallbacks class is a callback invoker when object is
- *        created and destroyed.
- */
-template <typename CallbackType, typename... Args>
-class ScopedCallbacks {
- public:
-  ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
-      : exit_(std::bind(exit, args...)) {
-    enter(args...);
-  }
-
-  ScopedCallbacks(const ScopedCallbacks& other) = delete;
-  ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete;
-
-  ~ScopedCallbacks() { exit_(); }
-
- private:
-  std::function<void()> exit_;
-};
-
-/**
- * std compatible allocator with memory alignment.
- * @tparam T type of allocator elements.
- * @tparam Alignment the alignment in bytes.
- */
-template <typename T, size_t Alignment>
-class AlignedAllocator {
- public:
-  /// std campatible typedefs.
-  typedef T* pointer;
-  typedef const T* const_pointer;
-  typedef T& reference;
-  typedef const T& const_reference;
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-
-  T* address(T& r) const { return &r; }
-
-  const T* address(const T& r) const { return &r; }
-
-  size_t max_size() const {
-    return std::numeric_limits<size_t>::max() / sizeof(T);
-  }
-
-  template <typename U>
-  struct rebind {
-    typedef AlignedAllocator<U, Alignment> other;
-  };
-
-  bool operator==(const AlignedAllocator& other) const { return true; }
-
-  bool operator!=(const AlignedAllocator& other) const {
-    return !(*this == &other);
-  }
-
-  void construct(const T* p, const T& t) const {
-    void* pv = const_cast<T*>(p);
-    new (pv) T(t);
-  }
-
-  void deallocate(const T* p, const size_type n) const {
-    (void)(n);  // UNUSED n
-    free(const_cast<T*>(p));
-  }
-
-  void destroy(const T* p) const { p->~T(); }
-
-  AlignedAllocator() {}
-  ~AlignedAllocator() {}
-
-  AlignedAllocator(const AlignedAllocator&) {}
-  template <typename U>
-  AlignedAllocator(const AlignedAllocator<U, Alignment>&) {}
-
-  /**
-   * @brief allocate n elements of type T, the first address is aligned by
-   *        Alignment bytes.
-   * @param n element count.
-   * @return begin address of allocated buffer
-   * @throw std::length_error for n * sizeof(T) is overflowed.
-   * @throw std::bad_alloc
-   */
-  T* allocate(const size_type n) const {
-    if (n == 0) {
-      return nullptr;
-    }
-    if (n > max_size()) {
-      throw std::length_error("AlignAllocator<T>::allocate() - Int Overflow.");
-    }
-    void* r = nullptr;
-    CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0);
-    if (r == nullptr) {
-      throw std::bad_alloc();
-    } else {
-      return static_cast<T*>(r);
-    }
-  }
-
-  template <typename U>
-  T* allocate(const std::size_t n, const U* /* const hint */) const {
-    return this->allocate(n);
-  }
-
- private:
-  AlignedAllocator& operator=(const AlignedAllocator&);  // disable
-};
-
-class Deprecated {
- public:
-  explicit Deprecated(const std::string& msg = "") {
-    if (msg.empty()) {
-      LOG(WARNING) << "This class is deprecated, please do not use this class.";
-    } else {
-      LOG(WARNING) << msg;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/Version.cpp b/paddle/legacy/utils/Version.cpp
deleted file mode 100644
index 731c30842..000000000
--- a/paddle/legacy/utils/Version.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Version.h"
-
-#include <iomanip>
-#include <numeric>
-#include "Flags.h"
-#include "Util.h"
-
-DECLARE_bool(version);
-
-namespace paddle {
-namespace version {
-
-void printVersion(std::ostream& os) {
-#ifndef PADDLE_VERSION
-#define PADDLE_VERSION "unknown"
-#endif
-// converts macro to string
-// https://gcc.gnu.org/onlinedocs/cpp/Stringification.html
-#define xstr(s) str(s)
-#define str(s) #s
-
-  os << "paddle version: " << xstr(PADDLE_VERSION) << std::endl
-     << std::boolalpha << "\t"
-     << "withGpu: " << version::isWithGpu() << std::endl
-     << "\t"
-     << "withAvx: " << version::isWithAvx() << std::endl
-     << "\t"
-     << "withPyDataProvider: " << version::isWithPyDataProvider() << std::endl
-     << "\t"
-     << "withTimer: " << version::isWithTimer() << std::endl
-     << "\t"
-     << "withFpga: " << version::isWithFpga() << std::endl
-     << "\t"
-     << "real byte size: " << version::sizeofReal() << std::endl
-     << std::endl;
-}
-
-void printVersion() {
-  if (FLAGS_version) {
-    printVersion(std::cout);
-    exit(0);
-  }
-}
-
-}  //  namespace version
-}  //  namespace paddle
diff --git a/paddle/legacy/utils/Version.h b/paddle/legacy/utils/Version.h
deleted file mode 100644
index 004d62451..000000000
--- a/paddle/legacy/utils/Version.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-#include <iostream>
-#include "Common.h"
-
-namespace paddle {
-
-/**
- * namespace paddle::version
- * Some constexpr to detect paddle version.
- *    use paddle_trainer --version to print version information.
- *
- * Possible output as follow:
- * paddle version:
- *    withGpu: false
- *    withAvx: false
- *    withPyDataProvider: true
- *    withTimer: false
- *    withFpga: false
- *    real byte size: 4
- */
-
-namespace version {
-
-/**
- * @brief print paddle version and exit when --version flag setted. Otherwise,
- * do nothing.
- */
-void printVersion();
-
-void printVersion(std::ostream& os);
-/**
- * @brief isWithGpu
- * @return return true if paddle compiled with GPU
- */
-constexpr bool isWithGpu() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithPyDataProvider
- * @return return true if paddle compiled with PyDataProvider
- *
- * @note: A complete python interpreter is embeded into paddle binary if paddle
- * is compiled with PyDataProvider. Then the config parser just invoke python
- * method. Otherwise, ConfigParser just serializes config into protobuf, and
- * pass to C++ by using stdio.
- */
-constexpr bool isWithPyDataProvider() {
-#ifdef PADDLE_NO_PYTHON
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithTimer
- * @return true if paddle compiled with timer.
- */
-constexpr bool isWithTimer() {
-#ifdef PADDLE_DISABLE_TIMER
-  return false;
-#else
-  return true;
-#endif
-}
-
-/**
- * @brief isWithAvx
- * @return true if paddle compiled with AVX instructs.
- */
-constexpr bool isWithAvx() {
-#ifdef __AVX__
-  return true;
-#else
-  return false;
-#endif
-}
-
-/**
- * @brief isWithFpga
- * @return true if paddle compiled with FPGA for prediction.
- */
-constexpr bool isWithFpga() {
-#ifdef PADDLE_USE_FPGA
-  return true;
-#else
-  return false;
-#endif
-}
-
-/**
- * @brief sizeofReal
- * @return return the byte size of real
- */
-constexpr size_t sizeofReal() { return sizeof(real); }
-
-/**
- * @brief isPaddleUseDouble
- * @return true if paddle compiled with double precision.
- */
-constexpr bool isPaddleUseDouble() { return sizeofReal() == sizeof(double); }
-
-/**
- * @brief isPaddleUseFloat
- * @return true if paddle compiled with float precision
- */
-constexpr bool isPaddleUseFloat() { return sizeofReal() == sizeof(float); }
-
-}  //  namespace version
-
-}  //  namespace paddle
diff --git a/paddle/legacy/utils/arch/linux/Locks.cpp b/paddle/legacy/utils/arch/linux/Locks.cpp
deleted file mode 100644
index 32d351e33..000000000
--- a/paddle/legacy/utils/arch/linux/Locks.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Locks.h"
-#include <semaphore.h>
-#include <unistd.h>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-class SemaphorePrivate {
- public:
-  sem_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  sem_init(&m->sem, 0, initValue);
-}
-
-Semaphore::~Semaphore() {
-  sem_destroy(&m->sem);
-  delete m;
-}
-
-bool Semaphore::timeWait(struct timespec* ts) {
-  return (0 == sem_timedwait(&m->sem, ts));
-}
-
-void Semaphore::wait() { sem_wait(&m->sem); }
-
-void Semaphore::post() { sem_post(&m->sem); }
-
-/// SpinLockPrivate
-
-#ifdef PADDLE_USE_PTHREAD_SPINLOCK
-
-class SpinLockPrivate {
- public:
-  inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
-  inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
-
-  inline void lock() { pthread_spin_lock(&lock_); }
-  inline void unlock() { pthread_spin_unlock(&lock_); }
-
-  pthread_spinlock_t lock_;
-  char padding_[64 - sizeof(pthread_spinlock_t)];
-};
-
-#else
-// clang-format off
-#include <cstddef>
-#include <atomic>
-// clang-format on
-
-class SpinLockPrivate {
- public:
-  inline void lock() {
-    while (lock_.test_and_set(std::memory_order_acquire)) {
-    }
-  }
-  inline void unlock() { lock_.clear(std::memory_order_release); }
-
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-#endif
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-void SpinLock::lock() { m->lock(); }
-void SpinLock::unlock() { m->unlock(); }
-
-/// ThreadBarrierPrivate
-
-#ifdef PADDLE_USE_PTHREAD_BARRIER
-
-class ThreadBarrierPrivate {
- public:
-  pthread_barrier_t barrier_;
-
-  inline explicit ThreadBarrierPrivate(int count) {
-    pthread_barrier_init(&barrier_, nullptr, count);
-  }
-
-  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
-
-  inline void wait() { pthread_barrier_wait(&barrier_); }
-};
-
-#else
-
-class ThreadBarrierPrivate {
- public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-#endif
-
-/// ThreadBarrier
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/arch/osx/Excepts.cpp b/paddle/legacy/utils/arch/osx/Excepts.cpp
deleted file mode 100644
index 2b7d6dca8..000000000
--- a/paddle/legacy/utils/arch/osx/Excepts.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Excepts.h"
-
-#if defined(__APPLE__) || defined(__OSX__)
-#if defined(__arm__) || defined(__arm64__)
-// TODO(liuyiqun): implement the arm version
-int fegetexcept(void) { return -1; }
-int feenableexcept(unsigned int excepts) { return -1; }
-int fedisableexcept(unsigned int excepts) { return -1; }
-#else
-int fegetexcept(void) {
-  static fenv_t fenv;
-  return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
-}
-
-int feenableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // unmask
-  fenv.__control &= ~new_excepts;
-  fenv.__mxcsr &= ~(new_excepts << 7);
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-
-int fedisableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // mask
-  fenv.__control |= new_excepts;
-  fenv.__mxcsr |= new_excepts << 7;
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-#endif
-#endif
diff --git a/paddle/legacy/utils/arch/osx/Locks.cpp b/paddle/legacy/utils/arch/osx/Locks.cpp
deleted file mode 100644
index b68c48f0c..000000000
--- a/paddle/legacy/utils/arch/osx/Locks.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Locks.h"
-#include <dispatch/dispatch.h>
-#include <libkern/OSAtomic.h>
-#include <atomic>
-#include "paddle/legacy/utils/Logging.h"
-
-namespace paddle {
-
-class SemaphorePrivate {
- public:
-  ~SemaphorePrivate() { dispatch_release(sem); }
-
-  dispatch_semaphore_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  m->sem = dispatch_semaphore_create(initValue);
-}
-
-Semaphore::~Semaphore() { delete m; }
-
-bool Semaphore::timeWait(timespec *ts) {
-  dispatch_time_t tm = dispatch_walltime(ts, 0);
-  return (0 == dispatch_semaphore_wait(m->sem, tm));
-}
-
-void Semaphore::wait() {
-  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
-}
-
-void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
-
-class SpinLockPrivate {
- public:
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-
-void SpinLock::lock() {
-  while (m->lock_.test_and_set(std::memory_order_acquire)) {
-  }
-}
-
-void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
-
-class ThreadBarrierPrivate {
- public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/legacy/utils/enable_virtualenv.py b/paddle/legacy/utils/enable_virtualenv.py
deleted file mode 100644
index 4e998381e..000000000
--- a/paddle/legacy/utils/enable_virtualenv.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-def __activate_virtual_env__():
-    __path__ = os.getenv('VIRTUAL_ENV')
-    if __path__ is None:
-        return
-    __script__ = os.path.join(__path__, 'bin', 'activate_this.py')
-    execfile(__script__, {'__file__': __script__})
-
-
-__activate_virtual_env__()
diff --git a/paddle/legacy/utils/tests/CMakeLists.txt b/paddle/legacy/utils/tests/CMakeLists.txt
deleted file mode 100644
index 4af01db5c..000000000
--- a/paddle/legacy/utils/tests/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-add_simple_unittest(test_Thread)
-add_simple_unittest(test_StringUtils)
-add_simple_unittest(test_CustomStackTrace)
-add_simple_unittest(test_ThreadBarrier)
-add_simple_unittest(test_SpinLock)
-add_simple_unittest(test_SIMDFlags)
-add_simple_unittest(test_Error)
-
-add_executable(
-    test_CustomStackTracePrint
-    test_CustomStackTracePrint.cpp
-)
-link_paddle_exe(test_CustomStackTracePrint)
-if(NOT APPLE)
-    add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
diff --git a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
deleted file mode 100644
index 2a418e3ae..000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gflags/gflags.h>  // NOLINT
-#include <gtest/gtest.h>    // NOLINT
-
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 10, "testing thread number");
-
-void testNormalImpl(
-    const std::function<void(paddle::CustomStackTrace<std::string>&,
-                             size_t,
-                             size_t,
-                             paddle::ThreadBarrier&,
-                             paddle::ThreadBarrier&)>& callback) {
-  paddle::CustomStackTrace<std::string> tracer;
-  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
-  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
-  constexpr size_t countDown = 10;
-  constexpr size_t layerSize = 1000;
-  std::vector<std::unique_ptr<std::thread>> threads;
-  threads.reserve(FLAGS_test_thread_num);
-
-  for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) {
-    threads.emplace_back(
-        new std::thread([&tracer, &startBarrier, &doneBarrier, &callback] {
-          callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
-        }));
-  }
-  size_t cntDown = countDown;
-  while (cntDown-- > 0) {
-    startBarrier.wait();
-    sleep(1);
-    doneBarrier.wait();
-    ASSERT_TRUE(tracer.empty());
-  }
-
-  for (auto& thread : threads) {
-    thread->join();
-  }
-}
-
-TEST(CustomStackTrace, normalTrain) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
-      }
-      finish.wait();
-    }
-  });
-}
-
-TEST(CustomStackTrace, normalTest) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      tracer.clear();  // in forward test, tracer will clear after forward.
-      finish.wait();
-    }
-  });
-}
diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
deleted file mode 100644
index 78886a3ed..000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/CustomStackTrace.h"
-#include "paddle/legacy/utils/StringUtil.h"
-#include "paddle/legacy/utils/Util.h"
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-
-  for (size_t i = 0; i < 1000; ++i) {
-    paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
-    if (i == 998) {
-      throw "Unhandle exception";
-    }
-  }
-
-  return 0;
-}
diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh b/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
deleted file mode 100755
index b5543485f..000000000
--- a/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-echo "Test Custom Stack Trace print correct result when fail"
-./test_CustomStackTracePrint >customStackTraceLog 2>&1
-if [ $? -eq 0 ]; then
-  exit 1
-else
-  set -e
-  TEXT=""
-  for ((i=0; i<=998; i++))
-  do
-    TEXT="layer_$i, "$TEXT
-  done
-  TEXT="Forwarding "$TEXT
-  grep -q "$TEXT" customStackTraceLog
-fi
diff --git a/paddle/legacy/utils/tests/test_Error.cpp b/paddle/legacy/utils/tests/test_Error.cpp
deleted file mode 100644
index 250c4d58a..000000000
--- a/paddle/legacy/utils/tests/test_Error.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/Error.h"
-
-#include <gtest/gtest.h>
-
-TEST(Error, testAll) {
-  paddle::Error error;
-  ASSERT_TRUE(error.isOK());
-  error = paddle::Error("I'm the error");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("I'm the error", error.msg());
-
-  error = paddle::Error("error2");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("error2", error.msg());
-
-  int i = 3;
-  auto error3 = paddle::Error("error%d", i);
-  ASSERT_FALSE(error3.isOK());
-  ASSERT_STREQ("error3", error3.msg());
-}
diff --git a/paddle/legacy/utils/tests/test_SIMDFlags.cpp b/paddle/legacy/utils/tests/test_SIMDFlags.cpp
deleted file mode 100644
index 6362210ac..000000000
--- a/paddle/legacy/utils/tests/test_SIMDFlags.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/CpuId.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(SIMDFlags, gccTest) {
-#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
-    !defined(__arm__) && !defined(__aarch64__)
-  // clang-format off
-  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
-  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
-  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
-  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
-  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
-  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
-  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
-  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
-// clang-format on
-#endif
-}
-
-TEST(SIMDFlags, normalPrint) {
-  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
-  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
-  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
-  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
-  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
-  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
-  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
-  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
-  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
-  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
-  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
-}
diff --git a/paddle/legacy/utils/tests/test_SpinLock.cpp b/paddle/legacy/utils/tests/test_SpinLock.cpp
deleted file mode 100644
index 4cd7836d6..000000000
--- a/paddle/legacy/utils/tests/test_SpinLock.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t, size_t&, paddle::SpinLock&)>& callback) {
-  paddle::SpinLock mutex;
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-
-  size_t count = 0;
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &count, &mutex, &callback] {
-      callback(thread_num, count, mutex);
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
-  }
-  // Check whether all threads reach this point or not
-  CHECK_EQ(count, thread_num);
-}
-
-TEST(ThreadSpinLock, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(
-        thread_num,
-        [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) {
-          std::lock_guard<paddle::SpinLock> lock(mutex);
-          ++count;
-        });
-  }
-}
diff --git a/paddle/legacy/utils/tests/test_StringUtils.cpp b/paddle/legacy/utils/tests/test_StringUtils.cpp
deleted file mode 100644
index 61d2815f0..000000000
--- a/paddle/legacy/utils/tests/test_StringUtils.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/legacy/utils/StringUtil.h"
-
-#include <gtest/gtest.h>
-
-TEST(StringUtil, to) {
-  ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
-}
diff --git a/paddle/legacy/utils/tests/test_Thread.cpp b/paddle/legacy/utils/tests/test_Thread.cpp
deleted file mode 100644
index 5e07da323..000000000
--- a/paddle/legacy/utils/tests/test_Thread.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/legacy/utils/Thread.h>
-#include <atomic>
-
-using paddle::AsyncThreadPool;  // NOLINT
-
-TEST(AsyncThreadPool, addJob) {
-  AsyncThreadPool pool(8);
-  auto a = pool.addJob([] { return 1; });
-  auto b = pool.addJob([] { return true; });
-  auto c = pool.addJob([] { return false; });
-
-  ASSERT_EQ(a.get(), 1);
-  ASSERT_TRUE(b.get());
-  ASSERT_FALSE(c.get());
-}
-
-TEST(AsyncThreadPool, addBatchJob) {
-  AsyncThreadPool pool(8);
-  std::atomic<int> counter{0};
-
-  std::vector<AsyncThreadPool::JobFunc> jobs;
-
-  for (int i = 0; i < 10000; i++) {
-    jobs.emplace_back([&] { counter++; });
-  }
-
-  pool.addBatchJobs(jobs);
-
-  ASSERT_EQ(counter, 10000);
-}
-
-TEST(AsyncThreadPool, multiThreadAddBatchJob) {
-  AsyncThreadPool levelOnePool(200);
-  AsyncThreadPool levelTwoPool(200);
-
-  std::shared_ptr<std::mutex> mut = std::make_shared<std::mutex>();
-  int counter = 0;
-  const int numMonitors = 300;
-  const int numSlaves = 300;
-  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
-    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
-      std::lock_guard<std::mutex> lk(*mut);
-      counter++;
-    });
-    levelTwoPool.addBatchJobs(slaveJobs);
-  });
-  levelOnePool.addBatchJobs(moniterJobs);
-  ASSERT_EQ(counter, numMonitors * numSlaves);
-}
-
-TEST(AsyncThreadPool, addBatchJobWithResults) {
-  AsyncThreadPool pool(100);
-
-  std::vector<std::function<int()>> jobs;
-  const int numJobs = 100;
-  for (int i = 0; i < numJobs; i++) {
-    jobs.emplace_back([i] { return i; });
-  }
-
-  std::vector<int> res;
-  pool.addBatchJobs(jobs, res);
-
-  for (int i = 0; i < numJobs; i++) {
-    ASSERT_EQ(res[i], i);
-  }
-}
diff --git a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
deleted file mode 100644
index 9c8851ae2..000000000
--- a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/legacy/utils/Locks.h"
-#include "paddle/legacy/utils/Logging.h"
-#include "paddle/legacy/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t,
-                             std::mutex&,
-                             std::set<std::thread::id>&,
-                             paddle::ThreadBarrier&)>& callback) {
-  std::mutex mutex;
-  std::set<std::thread::id> tids;
-  paddle::ThreadBarrier barrier(thread_num);
-
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] {
-      callback(thread_num, mutex, tids, barrier);
-    });
-  }
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-}
-
-TEST(ThreadBarrier, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(thread_num,
-                   [](size_t thread_num,
-                      std::mutex& mutex,
-                      std::set<std::thread::id>& tids,
-                      paddle::ThreadBarrier& barrier) {
-                     {
-                       std::lock_guard<std::mutex> guard(mutex);
-                       tids.insert(std::this_thread::get_id());
-                     }
-                     barrier.wait();
-                     // Check whether all threads reach this point or not
-                     CHECK_EQ(tids.size(), thread_num);
-                   });
-  }
-}
-- 
GitLab


From ef038743f1c015b13287abcb87f7d63717f45b1b Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:34:39 +0800
Subject: [PATCH 2/9] remove legacy python code

---
 python/paddle/trainer/PyDataProvider2.py      |  541 --
 .../paddle/trainer/PyDataProviderWrapper.py   |  749 --
 python/paddle/trainer/__init__.py             |   13 -
 python/paddle/trainer/config_parser.py        | 4447 ----------
 .../paddle/trainer/config_parser_extension.py |   39 -
 python/paddle/trainer/recurrent_units.py      |  357 -
 .../paddle/trainer_config_helpers/__init__.py |   25 -
 .../trainer_config_helpers/activations.py     |  263 -
 python/paddle/trainer_config_helpers/attrs.py |  291 -
 .../config_parser_utils.py                    |   51 -
 .../trainer_config_helpers/data_sources.py    |  213 -
 .../default_decorators.py                     |  164 -
 .../trainer_config_helpers/evaluators.py      |  813 --
 .../trainer_config_helpers/layer_math.py      |  113 -
 .../paddle/trainer_config_helpers/layers.py   | 7610 -----------------
 .../paddle/trainer_config_helpers/networks.py | 1813 ----
 .../trainer_config_helpers/optimizers.py      |  447 -
 .../paddle/trainer_config_helpers/poolings.py |  148 -
 .../tests/CMakeLists.txt                      |   17 -
 .../tests/ProtobufEqualMain.cpp               |   59 -
 .../tests/configs/.gitignore                  |    1 -
 .../tests/configs/file_list.sh                |   17 -
 .../tests/configs/generate_protostr.sh        |   27 -
 .../tests/configs/img_layers.py               |   38 -
 .../tests/configs/img_trans_layers.py         |   38 -
 .../tests/configs/last_first_seq.py           |   35 -
 .../tests/configs/layer_activations.py        |   34 -
 .../tests/configs/math_ops.py                 |   42 -
 .../tests/configs/projections.py              |   80 -
 .../configs/protostr/img_layers.protostr      |  193 -
 .../protostr/img_trans_layers.protostr        |  193 -
 .../configs/protostr/last_first_seq.protostr  |  102 -
 .../protostr/layer_activations.protostr       |  423 -
 .../tests/configs/protostr/math_ops.protostr  |  413 -
 .../configs/protostr/projections.protostr     |  466 -
 .../tests/configs/protostr/shared_fc.protostr |  125 -
 .../configs/protostr/shared_gru.protostr      |  289 -
 .../configs/protostr/shared_lstm.protostr     |  385 -
 .../protostr/simple_rnn_layers.protostr       |  424 -
 .../protostr/test_BatchNorm3D.protostr        |   93 -
 .../protostr/test_bi_grumemory.protostr       |  155 -
 .../protostr/test_bilinear_interp.protostr    |  137 -
 .../configs/protostr/test_clip_layer.protostr |   31 -
 .../protostr/test_conv3d_layer.protostr       |  132 -
 .../protostr/test_cost_layers.protostr        |  375 -
 .../test_cost_layers_with_weight.protostr     |  162 -
 .../test_cross_entropy_over_beam.protostr     |  207 -
 .../protostr/test_deconv3d_layer.protostr     |  132 -
 .../test_detection_output_layer.protostr      |   66 -
 .../protostr/test_dot_prod_layer.protostr     |   38 -
 .../protostr/test_expand_layer.protostr       |   56 -
 .../test_factorization_machine.protostr       |   39 -
 .../tests/configs/protostr/test_fc.protostr   |   98 -
 .../protostr/test_gated_unit_layer.protostr   |  106 -
 .../protostr/test_grumemory_layer.protostr    |   51 -
 .../configs/protostr/test_hsigmoid.protostr   |   62 -
 .../test_kmax_seq_socre_layer.protostr        |   59 -
 .../protostr/test_l2_distance_layer.protostr  |   39 -
 .../protostr/test_lstmemory_layer.protostr    |   53 -
 .../configs/protostr/test_maxout.protostr     |  233 -
 .../test_multibox_loss_layer.protostr         |   79 -
 .../protostr/test_multiplex_layer.protostr    |   63 -
 .../configs/protostr/test_ntm_layers.protostr |  225 -
 .../tests/configs/protostr/test_pad.protostr  |  122 -
 .../protostr/test_pooling3D_layer.protostr    |  123 -
 .../protostr/test_prelu_layer.protostr        |  144 -
 .../protostr/test_print_layer.protostr        |   27 -
 .../protostr/test_recursive_topology.protostr |  593 --
 .../protostr/test_repeat_layer.protostr       |   42 -
 .../protostr/test_resize_layer.protostr       |   27 -
 .../configs/protostr/test_rnn_group.protostr  |  738 --
 .../protostr/test_roi_pool_layer.protostr     |  100 -
 .../configs/protostr/test_row_conv.protostr   |   41 -
 .../protostr/test_row_l2_norm_layer.protostr  |   27 -
 .../protostr/test_scale_shift_layer.protostr  |   72 -
 .../test_scale_sub_region_layer.protostr      |   51 -
 .../protostr/test_seq_concat_reshape.protostr |   51 -
 .../protostr/test_seq_slice_layer.protostr    |   79 -
 .../protostr/test_sequence_pooling.protostr   |  162 -
 .../configs/protostr/test_smooth_l1.protostr  |   40 -
 .../protostr/test_split_datasource.protostr   |   72 -
 .../configs/protostr/test_spp_layer.protostr  |   40 -
 .../test_sub_nested_seq_select_layer.protostr |   37 -
 .../configs/protostr/unused_layers.protostr   |   27 -
 .../configs/protostr/util_layers.protostr     |   87 -
 .../tests/configs/run_tests.sh                |   44 -
 .../tests/configs/shared_fc.py                |   43 -
 .../tests/configs/shared_gru.py               |   54 -
 .../tests/configs/shared_lstm.py              |   56 -
 .../tests/configs/simple_rnn_layers.py        |   51 -
 .../tests/configs/test_BatchNorm3D.py         |   25 -
 .../tests/configs/test_bi_grumemory.py        |   21 -
 .../tests/configs/test_bilinear_interp.py     |   41 -
 .../tests/configs/test_clip_layer.py          |   20 -
 .../test_config_parser_for_non_file_config.py |   51 -
 .../tests/configs/test_conv3d_layer.py        |   63 -
 .../tests/configs/test_cost_layers.py         |   61 -
 .../configs/test_cost_layers_with_weight.py   |   33 -
 .../tests/configs/test_crop.py                |   35 -
 .../configs/test_cross_entropy_over_beam.py   |   45 -
 .../tests/configs/test_deconv3d_layer.py      |   64 -
 .../configs/test_detection_output_layer.py    |   37 -
 .../tests/configs/test_dot_prod_layer.py      |   21 -
 .../tests/configs/test_expand_layer.py        |   28 -
 .../configs/test_factorization_machine.py     |   21 -
 .../tests/configs/test_fc.py                  |   30 -
 .../tests/configs/test_gated_unit_layer.py    |   30 -
 .../tests/configs/test_grumemory_layer.py     |   27 -
 .../tests/configs/test_hsigmoid.py            |   22 -
 .../configs/test_kmax_seq_socre_layer.py      |    9 -
 .../tests/configs/test_l2_distance_layer.py   |   21 -
 .../tests/configs/test_lstmemory_layer.py     |   27 -
 .../tests/configs/test_maxout.py              |   56 -
 .../tests/configs/test_multibox_loss_layer.py |   39 -
 .../tests/configs/test_multiplex_layer.py     |   26 -
 .../tests/configs/test_ntm_layers.py          |   44 -
 .../tests/configs/test_pad.py                 |   34 -
 .../tests/configs/test_pooling3D_layer.py     |   52 -
 .../tests/configs/test_prelu_layer.py         |   24 -
 .../tests/configs/test_print_layer.py         |   23 -
 .../tests/configs/test_recursive_topology.py  |   30 -
 .../tests/configs/test_repeat_layer.py        |   25 -
 .../tests/configs/test_resize_layer.py        |   20 -
 .../tests/configs/test_rnn_group.py           |   62 -
 .../tests/configs/test_roi_pool_layer.py      |   37 -
 .../tests/configs/test_row_conv.py            |   23 -
 .../tests/configs/test_row_l2_norm_layer.py   |   20 -
 .../tests/configs/test_scale_shift_layer.py   |   23 -
 .../configs/test_scale_sub_region_layer.py    |   25 -
 .../tests/configs/test_seq_concat_reshape.py  |   26 -
 .../tests/configs/test_seq_slice_layer.py     |   13 -
 .../tests/configs/test_sequence_pooling.py    |   43 -
 .../tests/configs/test_smooth_l1.py           |   21 -
 .../tests/configs/test_split_datasource.py    |   24 -
 .../tests/configs/test_spp_layer.py           |   24 -
 .../test_sub_nested_seq_select_layer.py       |   11 -
 .../tests/configs/unused_layers.py            |   25 -
 .../tests/configs/util_layers.py              |   27 -
 .../tests/layers_test.py                      |   20 -
 .../tests/layers_test_config.py               |   86 -
 .../tests/test_reset_hook.py                  |   29 -
 python/paddle/trainer_config_helpers/utils.py |   33 -
 python/paddle/v2/__init__.py                  |  156 -
 python/paddle/v2/activation.py                |   26 -
 python/paddle/v2/attr.py                      |   29 -
 python/paddle/v2/config_base.py               |   68 -
 python/paddle/v2/data_feeder.py               |  133 -
 python/paddle/v2/data_type.py                 |   27 -
 python/paddle/v2/dataset/__init__.py          |   46 -
 python/paddle/v2/dataset/cifar.py             |  148 -
 python/paddle/v2/dataset/common.py            |  236 -
 python/paddle/v2/dataset/conll05.py           |  257 -
 python/paddle/v2/dataset/flowers.py           |  218 -
 python/paddle/v2/dataset/imdb.py              |  148 -
 python/paddle/v2/dataset/imikolov.py          |  161 -
 python/paddle/v2/dataset/mnist.py             |  129 -
 python/paddle/v2/dataset/movielens.py         |  262 -
 python/paddle/v2/dataset/mq2007.py            |  333 -
 python/paddle/v2/dataset/sentiment.py         |  141 -
 python/paddle/v2/dataset/tests/cifar_test.py  |   56 -
 python/paddle/v2/dataset/tests/common_test.py |   94 -
 .../paddle/v2/dataset/tests/flowers_test.py   |   51 -
 python/paddle/v2/dataset/tests/imdb_test.py   |   57 -
 .../paddle/v2/dataset/tests/imikolov_test.py  |   67 -
 python/paddle/v2/dataset/tests/mnist_test.py  |   44 -
 python/paddle/v2/dataset/tests/mq2007_test.py |   33 -
 .../paddle/v2/dataset/tests/test_sentiment.py |   55 -
 .../paddle/v2/dataset/tests/voc2012_test.py   |   42 -
 python/paddle/v2/dataset/tests/wmt16_test.py  |   66 -
 python/paddle/v2/dataset/uci_housing.py       |  134 -
 python/paddle/v2/dataset/voc2012.py           |   85 -
 python/paddle/v2/dataset/wmt14.py             |  181 -
 python/paddle/v2/dataset/wmt16.py             |  352 -
 python/paddle/v2/evaluator.py                 |   36 -
 python/paddle/v2/event.py                     |  113 -
 python/paddle/v2/image.py                     |  380 -
 python/paddle/v2/inference.py                 |  172 -
 python/paddle/v2/layer.py                     |  326 -
 python/paddle/v2/master/.gitignore            |    3 -
 python/paddle/v2/master/__init__.py           |   17 -
 python/paddle/v2/master/client.py             |   95 -
 python/paddle/v2/minibatch.py                 |   43 -
 python/paddle/v2/networks.py                  |   33 -
 python/paddle/v2/op.py                        |  120 -
 python/paddle/v2/optimizer.py                 |  297 -
 python/paddle/v2/parameters.py                |  441 -
 python/paddle/v2/plot/__init__.py             |   17 -
 python/paddle/v2/plot/plot.py                 |   82 -
 python/paddle/v2/plot/tests/CMakeLists.txt    |    5 -
 python/paddle/v2/plot/tests/__init__.py       |   16 -
 python/paddle/v2/plot/tests/test_ploter.py    |   40 -
 python/paddle/v2/pooling.py                   |   26 -
 python/paddle/v2/reader/__init__.py           |   74 -
 python/paddle/v2/reader/creator.py            |  130 -
 python/paddle/v2/reader/decorator.py          |  405 -
 python/paddle/v2/reader/tests/CMakeLists.txt  |    2 -
 python/paddle/v2/reader/tests/__init__.py     |   13 -
 python/paddle/v2/reader/tests/creator_test.py |   74 -
 .../paddle/v2/reader/tests/decorator_test.py  |  178 -
 .../v2/reader/tests/test_data_creator.txt     |    3 -
 .../v2/reader/tests/test_reader_recordio.dat  |  Bin 76 -> 0 bytes
 .../v2/reader/tests/test_recordio_creator.dat |  Bin 88 -> 0 bytes
 python/paddle/v2/tests/CMakeLists.txt         |    8 -
 python/paddle/v2/tests/cat.jpg                |  Bin 57218 -> 0 bytes
 python/paddle/v2/tests/test_data_feeder.py    |  267 -
 python/paddle/v2/tests/test_image.py          |   43 -
 python/paddle/v2/tests/test_layer.py          |  290 -
 python/paddle/v2/tests/test_op.py             |   51 -
 .../paddle/v2/tests/test_paramconf_order.py   |   99 -
 python/paddle/v2/tests/test_parameters.py     |  143 -
 python/paddle/v2/tests/test_rnn_layer.py      |  166 -
 python/paddle/v2/tests/test_topology.py       |   85 -
 python/paddle/v2/topology.py                  |  145 -
 python/paddle/v2/trainer.py                   |  258 -
 214 files changed, 37347 deletions(-)
 delete mode 100644 python/paddle/trainer/PyDataProvider2.py
 delete mode 100644 python/paddle/trainer/PyDataProviderWrapper.py
 delete mode 100644 python/paddle/trainer/__init__.py
 delete mode 100644 python/paddle/trainer/config_parser.py
 delete mode 100644 python/paddle/trainer/config_parser_extension.py
 delete mode 100644 python/paddle/trainer/recurrent_units.py
 delete mode 100644 python/paddle/trainer_config_helpers/__init__.py
 delete mode 100644 python/paddle/trainer_config_helpers/activations.py
 delete mode 100644 python/paddle/trainer_config_helpers/attrs.py
 delete mode 100644 python/paddle/trainer_config_helpers/config_parser_utils.py
 delete mode 100644 python/paddle/trainer_config_helpers/data_sources.py
 delete mode 100644 python/paddle/trainer_config_helpers/default_decorators.py
 delete mode 100644 python/paddle/trainer_config_helpers/evaluators.py
 delete mode 100644 python/paddle/trainer_config_helpers/layer_math.py
 delete mode 100644 python/paddle/trainer_config_helpers/layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/networks.py
 delete mode 100644 python/paddle/trainer_config_helpers/optimizers.py
 delete mode 100644 python/paddle/trainer_config_helpers/poolings.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/CMakeLists.txt
 delete mode 100644 python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/.gitignore
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/file_list.sh
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/img_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/math_ops.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/projections.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
 delete mode 100755 python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_crop.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_fc.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pad.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/configs/util_layers.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/layers_test.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/layers_test_config.py
 delete mode 100644 python/paddle/trainer_config_helpers/tests/test_reset_hook.py
 delete mode 100644 python/paddle/trainer_config_helpers/utils.py
 delete mode 100644 python/paddle/v2/__init__.py
 delete mode 100644 python/paddle/v2/activation.py
 delete mode 100644 python/paddle/v2/attr.py
 delete mode 100644 python/paddle/v2/config_base.py
 delete mode 100644 python/paddle/v2/data_feeder.py
 delete mode 100644 python/paddle/v2/data_type.py
 delete mode 100644 python/paddle/v2/dataset/__init__.py
 delete mode 100644 python/paddle/v2/dataset/cifar.py
 delete mode 100644 python/paddle/v2/dataset/common.py
 delete mode 100644 python/paddle/v2/dataset/conll05.py
 delete mode 100644 python/paddle/v2/dataset/flowers.py
 delete mode 100644 python/paddle/v2/dataset/imdb.py
 delete mode 100644 python/paddle/v2/dataset/imikolov.py
 delete mode 100644 python/paddle/v2/dataset/mnist.py
 delete mode 100644 python/paddle/v2/dataset/movielens.py
 delete mode 100644 python/paddle/v2/dataset/mq2007.py
 delete mode 100644 python/paddle/v2/dataset/sentiment.py
 delete mode 100644 python/paddle/v2/dataset/tests/cifar_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/common_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/flowers_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/imdb_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/imikolov_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/mnist_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/mq2007_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/test_sentiment.py
 delete mode 100644 python/paddle/v2/dataset/tests/voc2012_test.py
 delete mode 100644 python/paddle/v2/dataset/tests/wmt16_test.py
 delete mode 100644 python/paddle/v2/dataset/uci_housing.py
 delete mode 100644 python/paddle/v2/dataset/voc2012.py
 delete mode 100644 python/paddle/v2/dataset/wmt14.py
 delete mode 100644 python/paddle/v2/dataset/wmt16.py
 delete mode 100644 python/paddle/v2/evaluator.py
 delete mode 100644 python/paddle/v2/event.py
 delete mode 100644 python/paddle/v2/image.py
 delete mode 100644 python/paddle/v2/inference.py
 delete mode 100644 python/paddle/v2/layer.py
 delete mode 100644 python/paddle/v2/master/.gitignore
 delete mode 100644 python/paddle/v2/master/__init__.py
 delete mode 100644 python/paddle/v2/master/client.py
 delete mode 100644 python/paddle/v2/minibatch.py
 delete mode 100644 python/paddle/v2/networks.py
 delete mode 100644 python/paddle/v2/op.py
 delete mode 100644 python/paddle/v2/optimizer.py
 delete mode 100644 python/paddle/v2/parameters.py
 delete mode 100644 python/paddle/v2/plot/__init__.py
 delete mode 100644 python/paddle/v2/plot/plot.py
 delete mode 100644 python/paddle/v2/plot/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/plot/tests/__init__.py
 delete mode 100644 python/paddle/v2/plot/tests/test_ploter.py
 delete mode 100644 python/paddle/v2/pooling.py
 delete mode 100644 python/paddle/v2/reader/__init__.py
 delete mode 100644 python/paddle/v2/reader/creator.py
 delete mode 100644 python/paddle/v2/reader/decorator.py
 delete mode 100644 python/paddle/v2/reader/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/reader/tests/__init__.py
 delete mode 100644 python/paddle/v2/reader/tests/creator_test.py
 delete mode 100644 python/paddle/v2/reader/tests/decorator_test.py
 delete mode 100644 python/paddle/v2/reader/tests/test_data_creator.txt
 delete mode 100644 python/paddle/v2/reader/tests/test_reader_recordio.dat
 delete mode 100644 python/paddle/v2/reader/tests/test_recordio_creator.dat
 delete mode 100644 python/paddle/v2/tests/CMakeLists.txt
 delete mode 100644 python/paddle/v2/tests/cat.jpg
 delete mode 100644 python/paddle/v2/tests/test_data_feeder.py
 delete mode 100644 python/paddle/v2/tests/test_image.py
 delete mode 100644 python/paddle/v2/tests/test_layer.py
 delete mode 100644 python/paddle/v2/tests/test_op.py
 delete mode 100644 python/paddle/v2/tests/test_paramconf_order.py
 delete mode 100644 python/paddle/v2/tests/test_parameters.py
 delete mode 100644 python/paddle/v2/tests/test_rnn_layer.py
 delete mode 100644 python/paddle/v2/tests/test_topology.py
 delete mode 100644 python/paddle/v2/topology.py
 delete mode 100644 python/paddle/v2/trainer.py

diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
deleted file mode 100644
index 05635833b..000000000
--- a/python/paddle/trainer/PyDataProvider2.py
+++ /dev/null
@@ -1,541 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import cPickle
-import logging
-import collections
-import functools
-import itertools
-
-logging.basicConfig(format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]"
-                    " %(message)s")
-
-
-class SequenceType(object):
-    NO_SEQUENCE = 0
-    SEQUENCE = 1
-    SUB_SEQUENCE = 2
-
-    @classmethod
-    def tostring(cls, value):
-        for k in cls.__dict__:
-            if not k.startswith('__'):
-                if getattr(cls, k) == value:
-                    return cls.__name__ + '.' + k
-        return 'INVALID(' + str(value) + ')'
-
-
-# TODO(yuyang18): Add string data type here.
-class DataType(object):
-    Dense = 0
-    SparseNonValue = 1
-    SparseValue = 2
-    Index = 3
-
-    @classmethod
-    def tostring(cls, value):
-        for k in cls.__dict__:
-            if not k.startswith('__'):
-                if getattr(cls, k) == value:
-                    return cls.__name__ + '.' + k
-        return 'INVALID(' + str(value) + ')'
-
-
-class CacheType(object):
-    NO_CACHE = 0  # No cache at all
-
-    # First pass, read data from python.  And store them in memory. Read from
-    # memory during rest passes.
-    CACHE_PASS_IN_MEM = 1
-
-
-class InputType(object):
-    """
-    InputType is the base class for paddle input types.
-
-    ..  note::
-
-        this is a base class, and should never be used by user.
-
-    :param dim: dimension of input. If the input is an integer, it means the
-                value range. Otherwise, it means the size of layer.
-    :type dim: int
-    :param seq_type: sequence type of input. 0 means it is not a sequence. 1
-                     means it is a variable length sequence. 2 means it is a
-                     nested sequence.
-    :type seq_type: int
-    :param type: data type of input.
-    :type type: int
-    """
-    __slots__ = ['dim', 'seq_type', 'type']
-
-    def __init__(self, dim, seq_type, tp):
-        self.dim = dim
-        self.seq_type = seq_type
-        self.type = tp
-
-    def __repr__(self):
-        """
-        Return a human readable representation like 'InputType(dim=25921, 
-            seq_type=SequenceType.NO_SEQUENCE, type=DataType.Dense)'
-        """
-        repr_str = type(self).__name__
-        repr_str += '('
-        serialize_func_map = {
-            'dim': repr,
-            'seq_type': SequenceType.tostring,
-            'type': DataType.tostring
-        }
-        for idx, k in enumerate(self.__slots__):
-            if idx != 0:
-                repr_str += ', '
-            repr_str += (
-                k + '=' + serialize_func_map.get(k, repr)(getattr(self, k)))
-        repr_str += ')'
-        return repr_str
-
-
-def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Dense Array. It means the input feature is dense array with float type.
-    For example, if the input is an image with 28*28 pixels, the input of
-    Paddle neural network could be a dense vector with dimension 784 or a
-    numpy array with shape (28, 28).
-
-    For the 2-D convolution operation, each sample in one mini-batch must have
-    the similarly size in PaddlePaddle now. But, it supports variable-dimension
-    feature across mini-batch. For the variable-dimension, the param dim is not
-    used. While the data reader must yield numpy array and the data feeder will
-    set the data shape correctly.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.Dense)
-
-
-def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Sparse binary vector. It means the input feature is a sparse vector and the
-    every element in this vector is either zero or one.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.SparseNonValue)
-
-
-def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Sparse vector. It means the input feature is a sparse vector. Most of the
-    elements in this vector are zero, others could be any float value.
-
-    :param dim: dimension of this vector.
-    :type dim: int
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :return: An input type object.
-    :rtype: InputType
-    """
-    return InputType(dim, seq_type, DataType.SparseValue)
-
-
-def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
-    """
-    Data type of integer.
-
-    :param seq_type: sequence type of this input.
-    :type seq_type: int
-    :param value_range: range of this integer.
-    :type value_range: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return InputType(value_range, seq_type, DataType.Index)
-
-
-dense_vector = dense_slot
-sparse_binary_vector = sparse_non_value_slot
-sparse_float_vector = sparse_value_slot
-integer_value = index_slot
-
-# dense_array can be used for variable-length input feature.
-# Each feature is not a vector, but a multi-dimensional array.
-dense_array = dense_slot
-
-
-def dense_vector_sequence(dim):
-    """
-    Data type of a sequence of dense vector.
-
-    :param dim: dimension of dense vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return dense_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def dense_vector_sub_sequence(dim):
-    return dense_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def sparse_binary_vector_sequence(dim):
-    """
-    Data type of a sequence of sparse vector, which every element is either zero
-     or one.
-
-    :param dim: dimension of sparse vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def sparse_binary_vector_sub_sequence(dim):
-    return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def sparse_float_vector_sequence(dim):
-    """
-    Data type of a sequence of sparse vector, which most elements are zero,
-    others could be any float value.
-
-    :param dim: dimension of sparse vector.
-    :type dim: int
-    :return: An input type object
-    :rtype: InputType
-    """
-    return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
-
-
-def sparse_float_vector_sub_sequence(dim):
-    return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-def integer_value_sequence(value_range):
-    """
-    Data type of a sequence of integer.
-
-    :param value_range: range of each element.
-    :type value_range: int
-    """
-    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)
-
-
-def integer_value_sub_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SUB_SEQUENCE)
-
-
-integer_sequence = integer_value_sequence
-
-
-class SingleSlotWrapper(object):
-    def __init__(self, generator):
-        self.generator = generator
-
-    def __call__(self, obj, filename):
-        for item in self.generator(obj, filename):
-            if isinstance(item, dict):
-                yield item
-            else:
-                yield [item]
-
-
-class InputOrderWrapper(object):
-    def __init__(self, generator, input_order):
-        self.generator = generator
-        self.input_order = input_order
-
-    def __call__(self, obj, filename):
-        for item in self.generator(obj, filename):
-            if isinstance(item, dict):
-                yield [
-                    item.get(input_name, None)
-                    for input_name in self.input_order
-                ]
-            else:
-                yield item
-
-
-class CheckWrapper(object):
-    def __init__(self, generator, input_types, check_fail_continue, logger):
-        self.generator = generator
-        self.input_types = input_types
-        self.check_fail_continue = check_fail_continue
-        self.logger = logger
-
-    def __call__(self, obj, filename):
-        for items in self.generator(obj, filename):
-            try:
-                assert len(items) == len(self.input_types)
-                assert len(filter(lambda x: x is None, items)) == 0
-                for item, input_type in itertools.izip(items, self.input_types):
-                    callback = functools.partial(CheckWrapper.loop_callback,
-                                                 input_type)
-
-                    for _ in xrange(input_type.seq_type):
-                        callback = functools.partial(CheckWrapper.loop_check,
-                                                     callback)
-                    callback(item)
-
-                yield items
-            except AssertionError as e:
-                self.logger.warning(
-                    "Item (%s) is not fit the input type with error %s" %
-                    (repr(item), repr(e)))
-
-                if self.check_fail_continue:
-                    continue
-                else:
-                    raise
-
-    @staticmethod
-    def loop_callback(input_type, each):
-        assert isinstance(input_type, InputType)
-        if input_type.type == DataType.Dense:
-            assert isinstance(each, collections.Sequence)
-            for d in each:
-                assert isinstance(d, float)
-            assert len(each) == input_type.dim
-        elif input_type.type == DataType.Index:
-            assert isinstance(each, int)
-            assert each < input_type.dim
-        elif input_type.type == DataType.SparseNonValue \
-                or input_type.type == DataType.SparseValue:
-            assert isinstance(each, collections.Sequence)
-            sparse_id = set()
-            for k in each:
-                if input_type.type == DataType.SparseValue:
-                    k, v = k
-                    assert isinstance(v, float)
-                assert isinstance(k, int)
-                assert k < input_type.dim
-                sparse_id.add(k)
-            assert len(sparse_id) == len(each)
-        else:
-            raise RuntimeError("Not support input type")
-
-    @staticmethod
-    def loop_check(callback, item):
-        for each in item:
-            callback(each)
-
-
-class CheckInputTypeWrapper(object):
-    def __init__(self, generator, input_types, logger):
-        self.generator = generator
-        self.input_types = input_types
-        self.logger = logger
-
-    def __call__(self, obj, filename):
-        for items in self.generator(obj, filename):
-            try:
-                # dict type is required for input_types when item is dict type
-                assert (isinstance(items, dict) and \
-                        not isinstance(self.input_types, dict))==False
-                yield items
-            except AssertionError as e:
-                self.logger.error(
-                    "%s type is required for input type but got %s" %
-                    (repr(type(items)), repr(type(self.input_types))))
-                raise
-
-
-def provider(input_types=None,
-             should_shuffle=None,
-             pool_size=-1,
-             min_pool_size=-1,
-             can_over_batch_size=True,
-             calc_batch_size=None,
-             cache=CacheType.NO_CACHE,
-             check=False,
-             check_fail_continue=False,
-             init_hook=None,
-             **outter_kwargs):
-    """
-    Provider decorator. Use it to make a function into PyDataProvider2 object.
-    In this function, user only need to get each sample for some train/test
-    file.
-
-    The basic usage is:
-
-    ..  code-block:: python
-
-        @provider(some data provider config here...)
-        def process(settings, file_name):
-            while not at end of file_name:
-                sample = readOneSampleFromFile(file_name)
-                yield sample.
-
-    The configuration of data provider should be setup by\:
-
-    :param input_types: Specify the input types, can also be set in init_hook.
-                        It could be a list of InputType object. For example,
-                        input_types=[dense_vector(9), integer_value(2)]. Or user
-                        can set a dict of InputType object, which key is
-                        data_layer's name. For example, input_types=\
-                        {'img': img_features, 'label': label}. when using dict of
-                        InputType, user could yield a dict of feature values, which
-                        key is also data_layer's name.
-
-    :type input_types: list|tuple|dict
-
-    :param should_shuffle: True if data should shuffle. Pass None means shuffle
-                           when is training and not to shuffle when is testing.
-    :type should_shuffle: bool
-
-    :param pool_size: Max number of sample in data pool.
-    :type pool_size: int
-
-    :param min_pool_size: Set minimal sample in data pool. The PaddlePaddle will
-                          random pick sample in pool. So the min_pool_size
-                          effect the randomize of data.
-    :type min_pool_size: int
-
-    :param can_over_batch_size: True if paddle can return a mini-batch larger
-                                than batch size in settings. It is useful when
-                                custom calculate one sample's batch_size.
-
-                                It is very danger to set it to false and use
-                                calc_batch_size together. Default is true.
-    :type can_over_batch_size: bool
-
-    :param calc_batch_size: a method to calculate each sample's batch size.
-                            Default each sample's batch size is 1. But to you
-                            can customize each sample's batch size.
-    :type calc_batch_size: callable
-
-    :param cache: Cache strategy of Data Provider. Default is CacheType.NO_CACHE
-    :type cache: int
-
-    :param init_hook: Initialize hook. Useful when data provider need load some
-                      external data like dictionary. The parameter is
-                      (settings, file_list, \*\*kwargs).
-
-                      - settings. It is the global settings object. User can set
-                        settings.input_types here.
-                      - file_list. All file names for passed to data provider.
-                      - is_train. Is this data provider used for training or not.
-                      - kwargs. Other keyword arguments passed from
-                        trainer_config's args parameter.
-    :type init_hook: callable
-
-    :param check: Check the yield data format is as same as input_types. Enable
-                  this will make data provide process slow but it is very useful
-                  for debug. Default is disabled.
-    :type check: bool
-
-    :param check_fail_continue: Continue train or not when check failed. Just
-                                drop the wrong format data when it is True. Has
-                                no effect when check set to False.
-    :type check_fail_continue: bool
-    """
-
-    def __wrapper__(generator):
-        class DataProvider(object):
-            def __init__(self, file_list, **kwargs):
-                self.logger = logging.getLogger("")
-                self.logger.setLevel(logging.INFO)
-                self.input_types = None
-                self.should_shuffle = should_shuffle
-
-                true_table = [1, 't', 'true', 'on']
-                false_table = [0, 'f', 'false', 'off']
-                if not isinstance(self.should_shuffle, bool) and \
-                                self.should_shuffle is not None:
-
-                    if isinstance(self.should_shuffle, basestring):
-                        self.should_shuffle = self.should_shuffle.lower()
-
-                    if self.should_shuffle in true_table:
-                        self.should_shuffle = True
-                    elif self.should_shuffle in false_table:
-                        self.should_shuffle = False
-                    else:
-                        self.logger.warning(
-                            "Could not recognize should_shuffle (%s), "
-                            "just use default value of should_shuffle."
-                            " Please set should_shuffle to bool value or "
-                            "something in %s" %
-                            (repr(self.should_shuffle),
-                             repr(true_table + false_table)))
-                        self.should_shuffle = None
-
-                self.pool_size = pool_size
-                self.can_over_batch_size = can_over_batch_size
-                self.calc_batch_size = calc_batch_size
-                self.file_list = file_list
-                self.generator = generator
-                self.cache = cache
-                self.min_pool_size = min_pool_size
-                self.input_order = kwargs['input_order']
-                self.check = check
-                if init_hook is not None:
-                    init_hook(self, file_list=file_list, **kwargs)
-
-                if 'slots' in outter_kwargs:
-                    self.logger.warning('setting slots value is deprecated, '
-                                        'please use input_types instead.')
-                    self.slots = outter_kwargs['slots']
-                if input_types is not None:
-                    self.slots = input_types
-
-                if self.input_types is not None:
-                    self.slots = self.input_types
-
-                assert self.slots is not None, \
-                    "Data Provider's input_types must be set"
-                assert self.generator is not None
-
-                use_dynamic_order = False
-                if isinstance(self.slots, dict):  # reorder input_types
-                    self.slots = [self.slots[ipt] for ipt in self.input_order]
-                    use_dynamic_order = True
-
-                if len(self.slots) == 1:
-                    self.generator = SingleSlotWrapper(self.generator)
-
-                if use_dynamic_order:
-                    self.generator = InputOrderWrapper(self.generator,
-                                                       self.input_order)
-                else:
-                    self.generator = CheckInputTypeWrapper(
-                        self.generator, self.slots, self.logger)
-                if self.check:
-                    self.generator = CheckWrapper(self.generator, self.slots,
-                                                  check_fail_continue,
-                                                  self.logger)
-
-        return DataProvider
-
-    return __wrapper__
-
-
-def deserialize_args(args):
-    """
-    Internal use only.
-    :param args:
-    :return:
-    """
-    return cPickle.loads(args)
diff --git a/python/paddle/trainer/PyDataProviderWrapper.py b/python/paddle/trainer/PyDataProviderWrapper.py
deleted file mode 100644
index 374976db9..000000000
--- a/python/paddle/trainer/PyDataProviderWrapper.py
+++ /dev/null
@@ -1,749 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module provide a wrapper(decorator) to wrap a data process method into a
-PyDataProvider. Some examples are shown `here <data_provider/python_case.html>`_.
-"""
-
-import struct
-import array
-import random
-import gc
-import logging
-import pstats
-import sys
-import numpy
-import functools
-
-__all__ = [
-    'DenseSlot', 'SlotType', 'SparseNonValueSlot', 'StringSlot',
-    'SparseValueSlot', 'IndexSlot', 'PoolSize', 'GeneralPyDataProvider',
-    'provider', 'init_hook_wrapper'
-]
-
-try:  # Just for profile mode, will try to import cProfile first.
-    # Most python will contains cProfile, cProfile/profile are basically same.
-    # ref: https://docs.python.org/2/library/profile.html#introduction-to-the-profilers
-    import cProfile as profile
-except ImportError:
-    import profile
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-
-import io
-
-
-class SlotType(object):  # Just a hint for user.
-    pass
-
-
-class DenseSlot(SlotType):
-    """
-    Dense Slot Type: Each item is the value of a Dense Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [float, float, ... ]
-    - **Seq**: [[float, float, ...], [float, float ....], ... ]
-    - **SubSeq**: [[[float, float, ...], [float ....], ...] ,  \
-                   [[float, float, ...], [float ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 0
-
-
-class SparseNonValueSlot(SlotType):
-    """
-    Sparse NonValue Slot Type: Each item is the id of a Sparse Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [int, int, ...]
-    - **Seq**: [[int, int, ...], [int, int, ...], ... ]
-    - **SubSeq**: [[[int, int, ...], [int, ....], ...] ,  \
-                   [[int, int, ...], [int, ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 1
-
-
-class SparseValueSlot(SlotType):
-    """
-    Sparse Value Slot Type: Each item is the id and value of a Sparse Vector.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: [(int, float), (int, float), ... ]
-    - **Seq**: [[(int,float), (int, float), ... ], \
-                [(int, float), (int, float), ...], ... ]
-    - **SubSeq**: [[[(int,float), ...], [(int, float), ....], ...] ,  \
-                   [[(int,float), ...], [(int, float), ....], ...] , ...]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension.
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 2
-
-
-class IndexSlot(SlotType):
-    """
-    Index Value Slot Type: Each item is the id of Label.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: int
-    - **Seq**:  [int, int, ....]
-    - **SubSeq**: [[int, int, ...], [int, int, ...], ... ]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: int
-        """
-        self.dim = dim
-        self.type = 3
-
-
-class StringSlot(SlotType):
-    """
-    String Value Slot Type: Each item is a string for printout, \
-                            can be used in DataLayer too.
-
-    Its yield format for :code:`provider` is:
-
-    - **NonSeq**: string
-    - **Seq**: [string, string, ....]
-    - **SubSeq**:  [[string, string, ...], [string, string, ...], ... ]
-    """
-
-    def __init__(self, dim):
-        """
-        :param dim: slot dimension
-        :type dim: string
-        """
-        self.dim = dim
-        self.type = 6
-
-
-class SparseNonValueHandler(object):
-    """
-    Private Class, Use for converting python object to paddle string.
-    """
-
-    def __init__(self):
-        self.offsets = []
-        self.value = []
-        self.offset_count = 0
-
-    def __call__(self, ele):
-        """
-        It will be invoked when scan each sparse data.
-
-        :param ele: list of sparse data, maybe non-value [ idx, ... ] or value.
-                    [ (idx, val), ... ]
-        :type ele: list
-        """
-        self.offsets.append(self.offset_count)
-        self.offset_count += len(ele)
-        self.processElement(ele)
-
-    def processElement(self, ele):
-        """
-        Process for element list. See __call__ for more document.
-        """
-        self.value += ele
-
-    def done(self, data_stream, int_packer):
-        """
-        Dump data to stream.
-        :param data_stream: Output Stream.
-        :param int_packer:  A struct.Struct("i") object
-        """
-        data_stream.write(array.array("i", self.offsets).tostring())
-        data_stream.write(int_packer.pack(self.offset_count))
-        data_stream.write(array.array("i", self.value).tostring())
-
-
-class SparseValueHandler(SparseNonValueHandler):
-    """
-    Private class, use for converting python obj to paddle string.
-    """
-
-    def __init__(self):
-        SparseNonValueHandler.__init__(self)
-        self.weight = []
-
-    def processElement(self, ele):
-        for idx, w in ele:
-            self.value.append(idx)
-            self.weight.append(w)
-
-    def done(self, data_stream, int_packer):
-        SparseNonValueHandler.done(self, data_stream, int_packer)
-        data_stream.write(int_packer.pack(self.offset_count))
-        data_stream.write(array.array("f", self.weight).tostring())
-
-
-class StringHandler(object):
-    """
-    Private Class, Use for converting python object to paddle string.
-    """
-
-    def __init__(self, data_stream, int_packer):
-        self.data_stream = data_stream
-        self.int_packer = int_packer
-
-    def __call__(self, ele):
-        """
-        It will be invoked when scan each string data.
-        :param ele: string data
-        :type ele: str
-        """
-        self.data_stream.write(self.int_packer.pack(len(ele)))
-        self.data_stream.write(array.array("c", ele).tostring())
-
-
-class GeneralPyDataProvider:
-    def __init__(self, *file_list, **kwargs):
-        """
-        :param file_list: input file_list
-        """
-        del kwargs  # unused
-        gc.disable()
-        assert isinstance(self.logger, logging.Logger)
-        self.use_seq_flag = hasattr(self, "use_seq_flag") and self.use_seq_flag
-        self.slots_num = len(self.getSlots())
-        self.file_list = list(file_list)
-        self.generators = map(self.generateData, self.file_list)
-        self.int_packer = struct.Struct("i")
-        self.head_packer = struct.Struct("ii")
-        self.float_packer = struct.Struct("f")
-        self.shuffler = lambda *args, **kwargs: None
-        self.data_pool = []
-        self.has_subseq = []
-        self.has_checked = False
-
-        self.debug = hasattr(self, "debug") and self.debug
-
-        if hasattr(self, "profile_filename") and isinstance(
-                self.profile_filename, str):
-            self.profile_count = 0
-            self.is_profile = True
-        else:
-            self.is_profile = False
-
-        if not hasattr(self, "file_count") or not isinstance(self.file_count,
-                                                             int):
-            self.file_count = sys.maxint
-
-        if not hasattr(self, "can_over_batch_size"):
-            self.can_over_batch_size = True
-        elif not self.can_over_batch_size:
-            self.logger.warn(
-                "User should ensure every data size is not larger than batch"
-                " size when can_over_batch_size = False")
-
-        self.data_pool_idx = 0
-
-    def reset(self):
-        """Reset all data in provider."""
-
-        self.logger.debug("reset dataprovider.")
-        self.generators = map(self.generateData, self.file_list)
-        self.shuffler = lambda *args, **kwargs: None
-        self.data_pool = []
-        self.data_pool_idx = 0
-        if self.file_count != 0:
-            self.max_pool_size = 0
-
-        # When use Profile, each pass will print a profile result.
-        if self.is_profile:
-            if hasattr(self, "profiler") and isinstance(self.profiler,
-                                                        profile.Profile):
-                self.profiler.disable()
-                fn = "%s_%d" % (self.profile_filename, self.profile_count)
-                sortby = "cumulative"
-                with open(fn, "w") as f:
-                    pstats.Stats(
-                        self.profiler,
-                        stream=f).sort_stats(sortby).print_stats()
-                self.logger.info("saving profile to file %s" % fn)
-                self.profile_count += 1
-            self.logger.info("resetting profile")
-            self.profiler = profile.Profile()
-            self.profiler.enable()
-
-    def shuffle(self):
-        """ shuffle data"""
-        if not self.should_shuffle:
-            return
-        else:
-            self.logger.debug("shuffling data.")
-            random.shuffle(self.generators)
-            self.shuffler = random.shuffle
-
-    def getSlots(self):
-        """
-        :return : return a list of SlotType
-        :rtype: list
-        """
-        return []
-
-    def generateData(self, fn):
-        """
-        :param fn: file name
-        :return: a generator to yield data one by one.
-        """
-        raise NotImplementedError
-
-    def calculateDataBatchSize(self, data):
-        """
-        :param data: One sample which yield by generateData
-        :type data: list
-        :return: The batch size that the data contribute.
-        :rtype: int
-        """
-        return 1
-
-    def getHeader(self):
-        """return paddle header format"""
-        ret = self.head_packer.pack(self.slots_num, self.use_seq_flag)
-        for obj in self.getSlots():
-            ret += self.head_packer.pack(obj.type, obj.dim)
-        return ret
-
-    def getHeaderNative(self):
-        return self.use_seq_flag, self.getSlots()
-
-    def getNextBatchNative(self, batch_size):
-        ret_list = []
-        self.__prepareData(batch_size, ret_list)
-        return ret_list
-
-    def getNextBatch(self, batch_size):
-        """
-        :param batch_size: the batch_size approximately return.
-        :return: return paddle pyDataProvider format, just see documents.
-        :rtype: str
-
-        NOTE: If can_over_batch_size is True, the return batch_size >= input batch_size.
-              Otherwise, the return batch_size < input batch_size, BUT USER MUST ENSURE THAT each data's batch size
-              is less than input batch_size.
-        """
-        ret_list = []
-        current_batch_size = self.__prepareData(batch_size, ret_list)
-        # create unified format for ret_list with differnt slots_num
-        if self.slots_num == 1:
-            ret_list = [ret_list]
-
-        if current_batch_size == 0:
-            return self.int_packer.pack(current_batch_size)
-        data_bytes = io.BytesIO()
-        seq_bytes = io.BytesIO()
-        subseq_bytes = io.BytesIO()
-        data_stream = io.BufferedWriter(data_bytes)
-        seq_stream = io.BufferedWriter(seq_bytes)
-        subseq_stream = io.BufferedWriter(subseq_bytes)
-
-        def convertDataImpl(idx, data_callback):
-            """
-            This method will handle sequence in return data. invoke data_callback one by one.
-            :param idx: the slot index.
-            :param data_callback: a callback, which type is (each sample) => None.
-            """
-            indices = 0
-            slot_sample_num = len(ret_list)
-            if self.use_seq_flag:
-                slot_sample_num = 0
-                if self.has_subseq[idx]:  # has sub-sequence
-                    slot_subseq_num = 0
-                    for dat in ret_list:
-                        dat = dat[idx]
-                        slot_subseq_num += len(dat)
-                        for sub_dat in dat:
-                            slot_sample_num += len(sub_dat)
-                    subseq_stream.write(self.int_packer.pack(slot_subseq_num))
-                else:
-                    for dat in ret_list:
-                        dat = dat[idx]
-                        slot_sample_num += len(dat)
-                seq_stream.write(self.int_packer.pack(len(ret_list)))
-            data_stream.write(self.int_packer.pack(slot_sample_num))
-
-            for dat in ret_list:
-                dat = dat[idx]
-                if self.use_seq_flag:
-                    seq_stream.write(self.int_packer.pack(indices))
-                    if self.has_subseq[idx]:  # has sub-sequence
-                        for sub_dat in dat:
-                            writeDataStream(sub_dat, data_callback)
-                            subseq_stream.write(self.int_packer.pack(indices))
-                            indices += len(sub_dat)
-                    else:
-                        writeDataStream(dat, data_callback)
-                        indices += len(dat)
-                else:
-                    writeDataStream(dat, data_callback)
-
-        def writeDataStream(dat, data_callback):
-            if self.use_seq_flag > 0:
-                if data_callback is None:  # Special for index slot
-                    data_stream.write(array.array("i", dat).tostring())
-                else:
-                    for ele in dat:
-                        data_callback(ele)
-            else:
-                if data_callback is None:  # Special for index slot
-                    data_stream.write(self.int_packer.pack(dat))
-                else:
-                    data_callback(dat)
-
-        try:
-            for i in range(self.slots_num):
-                slot = self.getSlots()[i]
-                # According to the data_type, each slot data will be converted to binary
-                if isinstance(slot, DenseSlot):
-                    convertDataImpl(i, lambda e: data_stream.write(
-                        array.array("f", e).tostring()))
-                elif isinstance(slot, SparseNonValueSlot):
-                    handler = SparseNonValueHandler()
-                    convertDataImpl(i, handler)
-                    handler.done(data_stream, self.int_packer)
-                elif isinstance(slot, SparseValueSlot):
-                    handler = SparseValueHandler()
-                    convertDataImpl(i, handler)
-                    handler.done(data_stream, self.int_packer)
-                elif isinstance(slot, IndexSlot):
-                    convertDataImpl(i, None)
-                elif isinstance(slot, StringSlot):
-                    handler = StringHandler(data_stream, self.int_packer)
-                    convertDataImpl(i, handler)
-                else:
-                    raise RuntimeError("The data_type must be 0/1/2/3/6")
-            data_stream.flush()
-            seq_stream.flush()
-            subseq_stream.flush()
-
-            return "".join([
-                self.int_packer.pack(current_batch_size), data_bytes.getvalue(),
-                seq_bytes.getvalue(), subseq_bytes.getvalue()
-            ])
-
-        finally:
-            data_stream.close()
-            seq_stream.close()
-            subseq_stream.close()
-            data_bytes.close()
-            seq_bytes.close()
-            subseq_bytes.close()
-
-    def hasSubseq(self, ret_list):
-        # create unified format for ret_list with differnt slots_num
-        if self.slots_num == 1:
-            ret_list = [ret_list]
-        # decide whether slot has sub-sequence using its first sample
-        for i in range(self.slots_num):
-            slot = self.getSlots()[i]
-            dat = ret_list[0][i][0]
-            if isinstance(slot, IndexSlot) or isinstance(slot, StringSlot):
-                if isinstance(dat, list) or isinstance(dat, numpy.ndarray):
-                    self.has_subseq.append(1)  # has_subseq = True
-                    continue
-            elif isinstance(dat[0], list) or isinstance(dat[0], numpy.ndarray):
-                self.has_subseq.append(1)  # has_subseq = True
-                continue
-            self.has_subseq.append(0)  # has_subseq = False
-
-    def checkOrder(self):
-        first_noSubseq_slot = self.slots_num
-        last_subseq_slot = -1
-        for i in range(self.slots_num):
-            if not self.has_subseq[i]:
-                first_noSubseq_slot = i
-                break
-        for i in range(self.slots_num):
-            if self.has_subseq[i]:
-                last_subseq_slot = i
-        if first_noSubseq_slot < last_subseq_slot:
-            raise RuntimeError(
-                "slot hasSubseq must put before than slot without subseq")
-        self.has_checked = True
-
-    def __prepareData(self, batch_size, ret_list):
-        current_batch_size = 0
-        could_exit = False
-        while not could_exit:
-            if len(self.data_pool) == 0:
-                self.data_pool_idx = 0
-                self.fillPool()
-            if len(self.data_pool) != 0:
-                for idx in xrange(self.data_pool_idx, len(self.data_pool)):
-                    current_batch_size += self.calculateDataBatchSize(
-                        self.data_pool[idx])
-                    if current_batch_size >= batch_size:
-                        could_exit = True
-                        break
-                if current_batch_size > batch_size and not self.can_over_batch_size:  # if cannot over batch size
-                    current_batch_size -= self.calculateDataBatchSize(
-                        self.data_pool[idx])
-                    idx -= 1
-
-                ret_list += self.data_pool[self.data_pool_idx:idx + 1]
-
-                # for speed reason, just shift left index, not delete data actually.
-                self.data_pool_idx = idx + 1
-
-                if self.data_pool_idx == len(self.data_pool):
-                    self.data_pool = []
-            else:
-                break
-        if self.use_seq_flag and not self.has_checked:  # compute self.has_subseq and checkOrder only at first time
-            self.hasSubseq(ret_list)
-            self.checkOrder()
-        return current_batch_size
-
-    def fillPool(self):
-        """
-        Fill the pool to max_pool_size. If max_pool_size is None, then read file_count to pool.
-        """
-        if self.max_pool_size == 0:
-            for i in xrange(min(self.file_count, len(self.generators))):
-                self.data_pool += list(self.generators[i])
-            self.generators = self.generators[min(self.file_count,
-                                                  len(self.generators)):]
-            self.max_pool_size = len(self.data_pool)
-        else:
-            while len(self.data_pool) < self.max_pool_size and len(
-                    self.generators) != 0:
-                try:
-                    self.data_pool.append(self.generators[0].next())
-                except StopIteration:
-                    self.generators.pop(0)
-        self.shuffler(self.data_pool)
-
-
-class PoolSize(object):
-    """Max number of sample which contains in provider."""
-
-    def __init__(self, pool_size):
-        self.size = pool_size
-
-
-def default_init_hook(cls, *args, **kwargs):
-    """ default hook, do nothing """
-    del cls, args, kwargs
-
-
-def provider(slots=None,
-             use_seq=False,
-             should_shuffle=True,
-             pool_size=1,
-             can_over_batch_size=True,
-             calc_batch_size=lambda data: 1,
-             debug=False,
-             init_hook=default_init_hook,
-             profile_filename=None):
-    """
-    The decorator for PyDataProvider. User should use this to create Provider class.
-    User should only concern how to read sample from file.
-
-    So the basic usage is:
-
-    ..  code-block:: python
-
-        @provider(some data provider config here...)
-        def process(obj, file_name):
-            while not at end of file_name:
-                sample = readOneSampleFromFile(file_name)
-                yield sample.
-
-    The configuration of data provider should be setup by:
-
-    :param init_hook: A callback will be invoked when PyDataProvider instance \
-                      created. The parameter is (obj, \*args, \*\*kwargs).
-
-                      - **obj**: actually data provider instance, which \
-                                 contains some global objects in obj.xxxxx, \
-                                 and is used by process function.
-
-                        1. **obj.slots**: a list of SlotType Object. Can be \
-                                          set in init. For example, obj.slots = \
-                                          [DenseSlot(9), IndexSlot(2)].
-                        2. **obj.logger**: a logger object. User can invoke \
-                                          obj.logger.info(), obj.logger.fatal(), etc.
-
-                      - **args** and **kwargs**: the data provider __init__ \
-                                                 parameters. For example, load_data_args \
-                                                 will be found in \*\*kwargs, \
-                                                 and if you want to recieve \
-                                                 it from trainer_config, \
-                                                 recommand to use init_hook_wrapper
-    :type init_hook: callable
-
-    :param pool_size:
-                      - **int**: it will read at most pool_size files to memory.
-                      - **PoolSize**: it will read at most PoolSize.size samples to memory.
-                      - If not set, it will read all the files to memory.
-    :type pool_size: int | PoolSize
-
-    :param slots: Specify the SlotTypes, can also be set in init_hook. It has two formats:
-
-                  - A list of SlotType objects. For example, slots = \
-                    [DenseSlot(9), IndexSlot(2)].
-                  - A method return a list of SlotTypes, and the parameter of \
-                    method is (obj, \*file_list, \*\*kwargs).
-    :type slots: list | callable
-
-    :param use_seq:  False if use no sequence (Default). True if use sequence:
-
-                     - If sequence has **no sub-sequence**: Each slot will \
-                       return a list of data. This list is one sequence. \
-                       So the return format likes \
-                       [[a0, a1, a2], [b1, b2, b3, b4], [c1]].
-                     - If sequence has **sub-sequence**: Each slot will return \
-                       a nested-list of data. This list contains several \
-                       sub-lists, each sub-list is one sub-sequence. \
-                       So the return format likes \
-                       [[[a0, a1, a2], [a4, a5]], [[b1, b2, b3, b4], [b5, b6]], [[c1], [c2]]].
-    :type use_seq: bool
-
-    :param should_shuffle: True if data should shuffle.
-    :type should_shuffle: bool
-
-    :param calc_batch_size: The method calculate each data's batch size.
-
-                            - Default is the batch size of one sample.
-                            - User can customize by **lamda** funtion. For example, \
-                              :code:`calc_batch_size = lambda data : len(data)` \
-                              means calculating the token number of a sequence data.
-    :type calc_batch_size: callable
-
-    :param can_over_batch_size: Whether :code:`actual batch size >= input batch size`
-
-                                - **True** (>=): getNextBatch method can return more data (Default).
-                                - **False** (<): user must ensure that each data's batch size < input batch size.
-    :type can_over_batch_size: bool
-
-    :param debug: True if enable debug logger and some debug check. Default is False.
-    :type debug: bool
-
-    :param profile_filename: None if disable profile (Default). Otherwise, \
-                             the data provider will dump profile result when \
-                             reset. And the dump filename is \
-                             **<profile_filename>_<reset_count>**.
-    :type profile_filename: None | Str
-    """
-
-    def _wrapper(handler):
-        class Cls(GeneralPyDataProvider):
-            """ Real PyDataProvider Class. """
-
-            def __init__(self, *file_list, **kwargs):
-                logging.basicConfig(
-                    format="[%(levelname)s %(asctime)s %(filename)s:%(lineno)s]"
-                    " %(message)s")
-
-                self.logger = logging.getLogger("")
-                if debug:
-                    self.logger.setLevel(logging.DEBUG)
-                    self.logger.debug("Running pydataprovider in debug mode.")
-                else:
-                    self.logger.setLevel(logging.INFO)
-
-                init_hook(self, *file_list, **kwargs)
-                if callable(slots):
-                    self.slots = slots(self, *file_list, **kwargs)
-                elif slots is not None:
-                    self.slots = slots
-
-                if isinstance(pool_size, int):
-                    self.max_pool_size = 0
-                    self.file_count = pool_size
-                elif isinstance(pool_size, PoolSize):
-                    self.max_pool_size = pool_size.size
-                    self.file_count = 0
-                else:
-                    raise RuntimeError
-                self.can_over_batch_size = can_over_batch_size
-                self.debug = debug
-                self.profile_filename = profile_filename
-                self.use_seq_flag = use_seq
-                self.should_shuffle = should_shuffle
-                GeneralPyDataProvider.__init__(self, *file_list, **kwargs)
-
-            def getSlots(self):
-                return self.slots
-
-            def generateData(self, f):
-                return handler(self, f)
-
-            def calculateDataBatchSize(self, data):
-                return calc_batch_size(data)
-
-        return Cls
-
-    return _wrapper
-
-
-def init_hook_wrapper(func):
-    """
-    Wrap a method for PyDataProviderWrapper's init_hook. This method can
-    receive parameter from trainer_config's load_data_args. The load_data_args
-    must pass a pickle.dumps() value, and dump a map as keyword args. The
-    wrapped method :code:`func` will receive them as keyword args.
-
-    So an example usage is:
-
-    ..  code-block:: python
-
-        @init_hook_wrapper
-        def hook(obj, dictionary, file_list, **kwargs):
-            obj.dictionary = dictionary
-            obj.slots = [IndexSlot(len(obj.dictionary)),
-                         IndexSlot(len(open(file_list[0], "r").readlines()))]
-
-    :param func: init_hook function
-    :type func: callable
-    :return: wrapped method, can be passed into @provider.
-    """
-
-    @functools.wraps(func)
-    def wrapper(obj, *file_list, **kwargs):
-        args = kwargs.get("load_data_args", dict())
-        if isinstance(args, basestring):
-            args = pickle.loads(args)
-        args['file_list'] = file_list
-        func(obj=obj, **args)
-
-    return wrapper
diff --git a/python/paddle/trainer/__init__.py b/python/paddle/trainer/__init__.py
deleted file mode 100644
index f662d6826..000000000
--- a/python/paddle/trainer/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
deleted file mode 100644
index 5b90facd4..000000000
--- a/python/paddle/trainer/config_parser.py
+++ /dev/null
@@ -1,4447 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-'''
-The following functions are available in the config file:
-
-Bias: define bias. To be used as value of bias argument in Layer().
-
-Data: define data provider.
-
-Input: define input layer for a layer. To be used as element of inputs argument
-       in Layer().
-
-Conv: define a convolution operation for an input of a layer.
-
-Norm: define a normalization operation for an input of a layer.
-
-Pool: define a pooling operation for an input of a layer.
-
-Layer: define a layer.
-
-Parameter: define a parameter.
-
-Import: import another config file. If the imported config file name is
-        a relative path, then it will be searched under the directory of the
-        current config file.
-
-Inputs(layer_names...):
-    Define the name of the input layers of the NeuralNetwork.
-    The type of these layers must be "data".
-    These layers will be provided with the DataBatch obtained
-    from DataProvider. The data streams from DataProvider must
-    have the same order.
-
-Outputs(layer_names...):
-    Define the name of the output layers of the NeuralNetwork.
-    Usually the output is simply the cost layer.
-    You can specify other layers as outputs and  calculate the
-    cost (and its derivative) yourself.
-
-
-default_initial_std(val)
-default_initial_mean(val)
-default_momentum(val):
-default_decay_rate(val): Set the default value for these parameters
-
-
-get_config_arg(name, type, default): Get the value for a config parameter.
-
-
-*** customized extension to config_parser ***
-The functionality of the config_parser can be extended.
-If the config_arg_str for parse_config() contains
-extension_module_name=[MODULE_NAME], then config_parser will call
-MODULE_NAME.get_config_funcs(g_config)
-MODULE_NAME.get_config_funcs() should return a dictionary of name to functions,
-those functions will be available in the config file.
-See legacy/trainer/tests/config_parser_test.py for example
-
-To use this from paddle_trainer, paddle_trainer should be called with
---config_args=extension_module_name=[MODULE_NAME]
-
-'''
-import copy
-import logging
-import os
-import sys
-import traceback
-import math
-import shutil
-
-try:
-    from paddle.proto.DataConfig_pb2 import DataConfig
-    from paddle.proto.ModelConfig_pb2 import ModelConfig
-    from paddle.proto.ModelConfig_pb2 import LayerConfig
-    from paddle.proto.ModelConfig_pb2 import LayerInputConfig
-    from paddle.proto.ModelConfig_pb2 import ProjectionConfig
-    from paddle.proto.ModelConfig_pb2 import OperatorConfig
-    from paddle.proto.ModelConfig_pb2 import GeneratorConfig
-    from paddle.proto.ModelConfig_pb2 import LinkConfig
-    from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-    from paddle.proto.ParameterConfig_pb2 import ParameterUpdaterHookConfig
-    from paddle.proto.TrainerConfig_pb2 import TrainerConfig
-
-except Exception as e:
-    traceback.print_exc()
-    raise
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-__real_print__ = print
-print = logger.info
-
-# from layer type name to layer class
-g_layer_type_map = {}
-
-
-# Initialize global variables. We use this function so that we can
-# call parse_config() multiple times
-def init_config_environment(
-        g_default_momentum=None,
-        g_default_decay_rate=None,
-        g_default_initial_mean=0.,
-        g_default_initial_std=0.01,
-        g_default_num_batches_regularization=None,
-        g_default_initial_strategy=0,
-        g_default_initial_smart=False,
-        g_default_gradient_clipping_threshold=None,
-        g_default_device=None,
-        g_default_update_hooks=None,
-        g_default_compact_func=None,
-        g_config=TrainerConfig(),
-        g_layer_map={},
-        g_parameter_map={},
-        g_parameter_initializer_map={},
-        g_extended_config_funcs={},
-
-        # store command args of paddle_trainer
-        g_command_config_args={},
-
-        # Used for PyDataProvider to avoid duplicate module name
-        g_py_module_name_list=[],
-        g_current_submodel=None,
-        g_root_submodel=None,
-        g_submodel_map={},
-        g_submodel_stack=[],
-        g_add_submodel_suffix=False, ):
-
-    # directly iterate through locals().iteritems() will change
-    # the size of locals() due to introducing k, v into scope
-    # which will break the process in some env
-
-    local_vars = copy.deepcopy(locals())
-    for k, v in local_vars.iteritems():
-        globals()[k] = v
-
-
-# Because type is widely used as a variable name in this code.
-# we need a different function name for the builtin type()
-def type_of(x):
-    return type(x)
-
-
-# Check a condition derived config file
-def config_assert(b, msg):
-    if not b:
-        logger.fatal(msg)
-
-
-g_config_funcs = {}
-
-
-# decorator for indicating a function which can be used in config file
-def config_func(func):
-    g_config_funcs[func.func_name] = func
-    return func
-
-
-# decorator for indicating a class which can be used in config file
-def config_class(cls):
-    g_config_funcs[cls.__name__] = cls
-    return cls
-
-
-# decorator for indicating a class for a layer type
-def config_layer(layer_type):
-    def wrap(cls):
-        g_config_funcs[cls.__name__] = cls
-        g_layer_type_map[layer_type] = cls
-        return cls
-
-    return wrap
-
-
-def gen_parameter_name(layer_name, input_index):
-    return '_%s.w%d' % (layer_name, input_index)
-
-
-def gen_bias_parameter_name(layer_name):
-    return '_%s.wbias' % layer_name
-
-
-def default(x, default_value):
-    return default_value if x is None else x
-
-
-class Cfg(object):
-    def add_keys(self, locals):
-        for k, v in locals.iteritems():
-            if not k.startswith('_'):
-                self.__setattr__(k, v)
-
-
-# functions available in config file
-
-
-# Define the name of the input layers of the NeuralNetwork.
-# The type of these layers must be "data".
-# These layers will be provided with the DataBatch obtained
-# from DataProvider. The data streams from DataProvider must
-# have the same order.
-@config_func
-def Inputs(*args):
-    for name in args:
-        name = MakeLayerNameInSubmodel(name)
-        global g_current_submodel, g_root_submodel
-        if g_current_submodel.is_recurrent_layer_group:
-            config_assert(False, "Do not set Inputs in recurrent layer group")
-        else:
-            g_current_submodel.input_layer_names.append(name)
-
-        if g_current_submodel is g_root_submodel:
-            g_config.model_config.input_layer_names.append(name)
-
-
-@config_func
-def HasInputsSet():
-    return len(g_current_submodel.input_layer_names) != 0
-
-
-# Define the name of the output layers of the NeuralNetwork.
-# Usually the output is simply the cost layer.
-# You can specify other layers as outputs and calculate the
-# cost (and its derivative) yourself.
-@config_func
-def Outputs(*args):
-    for name in args:
-        name = MakeLayerNameInSubmodel(name)
-        global g_current_submodel, g_root_submodel
-        if g_current_submodel.is_recurrent_layer_group:
-            config_assert(False, "Do not set Outputs in recurrent layer group")
-        else:
-            g_current_submodel.output_layer_names.append(name)
-
-        if g_current_submodel is g_root_submodel:
-            g_config.model_config.output_layer_names.append(name)
-
-
-@config_func
-def SubModelBegin(name):
-    global g_current_submodel, g_root_submodel, g_submodel_stack
-    g_submodel_stack.append(g_current_submodel)
-
-    name = MakeLayerNameInParentSubmodel(name)  #rename in nested submodel
-
-    config_assert(name not in g_submodel_map,
-                  'Duplicated submodel name: %s' % name)
-
-    sub_model = g_config.model_config.sub_models.add()
-    sub_model.name = name
-    g_submodel_map[name] = sub_model
-    g_current_submodel = sub_model
-
-
-@config_func
-def SubModelEnd(name=None):
-    global g_current_submodel, g_root_submodel, g_submodel_stack
-    config_assert(g_current_submodel is not g_root_submodel,
-                  "submodel not begin")
-    if name is not None:
-        config_assert(
-            g_current_submodel.name == MakeLayerNameInParentSubmodel(name),
-            "submodel name error")
-
-    g_current_submodel = g_submodel_stack.pop()
-
-
-def MakeLayerNameInParentSubmodel(name):
-    suffix = ""
-    if len(g_submodel_stack) > 1:
-        suffix = "@" + g_submodel_stack[-1].name
-    return name + suffix
-
-
-def GetLayerBaseName(name):
-    return name.split('@')[0]
-
-
-def MakeLayerNameInSubmodel(name, submodel_name=None):
-    global g_current_submodel
-    global g_add_submodel_suffix
-    if (submodel_name is None and not g_add_submodel_suffix and
-            not g_current_submodel.is_recurrent_layer_group):
-        return name
-    if submodel_name is None:
-        submodel_name = g_current_submodel.name
-    return name + "@" + submodel_name
-
-
-# Define a recurrent layer group begin with RecurrentLayerGroupBegin
-# and end with RecurrentLayerGroupEnd.
-# A recurrent layer group forward/backward one frame after previous frame
-# forward/backward through all layers in layer group.
-# in_links are names of layer used as input layer in the layer group.
-# out_links are names of layer in layer group used as outside layer's input.
-#
-# If generator is set, the layer group need one or more than one outlinks.
-# The first outlink should always be the generated token ids.
-# If generator.num_results_per_sample is not set, the output for one sample is
-# a ids sequence. Else if num_results_per_sample is more than one,
-# the output for one sample is up to #num_results_per_sample generated
-# sequences, which are packed in one sequence in output ids vector. Each
-# generated sequence has a generation probability. The probabilities for one
-# sample are stored in one row of output value matrix.
-# Packed generated sequences format, for each i:
-#   seq_i_length: one interger, seq_i content length,
-#   [seq_i content], length = seq_i_length
-#   seq_i_end_mark: one interger, for format check, always -1
-# You can use "seq_text_printer" to print the output of the generator.
-@config_func
-def RecurrentLayerGroupWithoutOutLinksBegin(name,
-                                            in_links,
-                                            seq_reversed=False,
-                                            target_inlinkname=""):
-    global g_current_submodel
-    config_assert(g_config.model_config.type == "recurrent_nn",
-                  "RecurrentLayerGroup should be used only in recurrent_nn")
-    RecurrentLayerGroup(name=name)  # add to father model
-    SubModelBegin(name)
-    g_current_submodel.is_recurrent_layer_group = True
-    g_current_submodel.reversed = seq_reversed
-    in_links_count = 0
-    for linkid, link in enumerate(in_links):
-        if isinstance(link, basestring):
-            name = link
-        else:
-            name = link.link_name
-
-        in_links_count += 1
-        layer_name = MakeLayerNameInParentSubmodel(name)
-        layer = g_layer_map[layer_name]
-        ScatterAgentLayer(
-            name=name, size=layer.size, width=layer.width, height=layer.height)
-
-        pair = g_current_submodel.in_links.add()
-        pair.layer_name = layer_name
-        pair.link_name = MakeLayerNameInSubmodel(name)
-
-
-@config_func
-def RecurrentLayerGroupSetOutLink(link):
-    if isinstance(link, basestring):
-        name = link
-    else:
-        name = link.link_name
-    layer_name = MakeLayerNameInParentSubmodel(name)
-    pair = g_current_submodel.out_links.add()
-    pair.layer_name = MakeLayerNameInSubmodel(name)
-    pair.link_name = layer_name
-
-
-def RecurrentLayerGroupSetGenerator(generator=None):
-    generator.eos_layer_name = MakeLayerNameInSubmodel(generator.eos_layer_name)
-    g_current_submodel.generator.CopyFrom(generator)
-
-
-@config_func
-def RecurrentLayerGroupBegin(name,
-                             in_links,
-                             out_links,
-                             generator=None,
-                             target_inlinkname="",
-                             seq_reversed=False):
-    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed)
-    for link in out_links:
-        RecurrentLayerGroupSetOutLink(link)
-
-    if generator is not None:
-        RecurrentLayerGroupSetGenerator(generator)
-        config_assert(
-            len(in_links) == 0, "no in_links should be passed to generator")
-        config_assert(
-            len(out_links) >= 1,
-            "one or more than one out_links should be passed to generator")
-
-
-@config_func
-def RecurrentLayerGroupEnd(name):
-    global g_current_submodel
-    config_assert(g_current_submodel.is_recurrent_layer_group,
-                  "RecurrentLayerGroup not begin")
-    for pair in g_current_submodel.memories:  #check exist
-        layer = g_layer_map[pair.layer_name]
-        config_assert(layer is not None,
-                      "memory declare wrong name:%s" % pair.layer_name)
-        memory_link = g_layer_map[pair.link_name]
-        config_assert(layer.size == memory_link.size,
-                      "memory declare wrong size:%d" % memory_link.size)
-
-    prev_submodel = g_current_submodel
-    SubModelEnd(name)
-
-    for pair in prev_submodel.out_links:
-        layer = g_layer_map[pair.layer_name]
-        # add out agent to father model
-        agent_name = GetLayerBaseName(pair.link_name)
-        if prev_submodel.HasField("generator"):
-            DataLayer(name=agent_name, size=layer.size)
-        else:
-            GatherAgentLayer(name=agent_name, size=layer.size)
-
-
-# Define the model type
-# currently, the paddle supports "nn", "recurrent_nn", "recursive_nn" and "multi_nn"
-@config_func
-def model_type(name):
-    g_config.model_config.type = name
-
-
-@config_class
-class Bias(Cfg):
-    def __init__(self,
-                 parameter_name=None,
-                 learning_rate=None,
-                 momentum=None,
-                 decay_rate=None,
-                 decay_rate_l1=None,
-                 initial_mean=None,
-                 initial_std=None,
-                 initial_strategy=None,
-                 initial_smart=None,
-                 num_batches_regularization=None,
-                 sparse_remote_update=None,
-                 gradient_clipping_threshold=None,
-                 is_static=None,
-                 is_shared=None,
-                 initializer=None):
-        self.add_keys(locals())
-
-
-# Define one input for a layer
-@config_class
-class Input(Cfg):
-    def __init__(
-            self,
-            input_layer_name,
-            parameter_name=None,
-            initializer=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            sparse_update=None,
-            gradient_clipping_threshold=None,
-            conv=None,
-            bilinear_interp=None,
-            norm=None,
-            pool=None,
-            image=None,
-            block_expand=None,
-            maxout=None,
-            spp=None,
-            pad=None,
-            upsample=None,
-            format=None,
-            nnz=None,
-            is_static=None,
-            is_shared=None,
-            update_hooks=None,
-            input_layer_argument=None,
-            make_layer_name_in_submodel=True, ):
-        """
-        @param make_layer_name_in_submodel True by defalut, you might need to
-        set it carefully when adding Input in config_parser.py.
-        """
-        self.add_keys(locals())
-        self.input_layer_name = MakeLayerNameInSubmodel(
-            input_layer_name
-        ) if make_layer_name_in_submodel else input_layer_name
-
-
-# Define a projection for iexed layer
-@config_class
-class Projection(Input):
-    type = None  # subclass should set it correctly
-
-    def __init__(
-            self,
-            input_layer_name,
-            size=0,  # projection output size
-            parameter_name=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            initializer=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            sparse_update=None,
-            gradient_clipping_threshold=None,
-            ptype=None,
-            format=None,
-            nnz=None,
-            is_static=None,
-            is_shared=None,
-            update_hooks=None,
-            input_layer_argument=None, ):
-        self.add_keys(locals())
-        self.input_layer_name = MakeLayerNameInSubmodel(input_layer_name)
-
-        self.proj_conf = ProjectionConfig()
-        if ptype is not None:
-            self.proj_conf.type = ptype
-        else:
-            self.proj_conf.type = self.type
-
-    # calculate the output_size given input_size. return 0
-    # to indicate using the size from Layer config
-    def calc_output_size(self, input_layer_config):
-        return self.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        raise NotimplementedError
-
-    def calc_parameter_dims(self, input_size, output_size):
-        raise NotimplementedError
-
-
-@config_class
-class IdentityProjection(Projection):
-    type = 'identity'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-# Like IdentityProjection, but layer size may smaller than input size,
-# the projection select dimesions [offset, offset+layer_size) from input
-@config_class
-class IdentityOffsetProjection(Projection):
-    type = 'identity_offset'
-
-    def __init__(self, input_layer_name, offset, **xargs):
-        super(IdentityOffsetProjection, self).__init__(input_layer_name,
-                                                       **xargs)
-        self.proj_conf.offset = offset
-
-    def calc_output_size(self, input_layer_config):
-        return 0  # depends on the outside MixedLayer
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-@config_class
-class SliceProjection(Projection):
-    type = 'slice'
-
-    def __init__(self, input_layer_name, slices, **xargs):
-        super(SliceProjection, self).__init__(input_layer_name, **xargs)
-        input = g_layer_map[input_layer_name]
-        if input.type in ["exconv", "cudnn_conv"]:
-            # the slice operator is for the channel dimension
-            assert input.num_filters is not None
-            channels = input.num_filters
-            image_size = input.size / channels
-            assert slices[len(slices) - 1][1] <= channels
-            for i in xrange(len(slices)):
-                slice = self.proj_conf.slices.add()
-                slice.start = slices[i][0] * image_size
-                slice.end = slices[i][1] * image_size
-                self.size += slice.end - slice.start
-        else:
-            config_assert(False,
-                          'Currently the input should be convolution layer')
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 0
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return []
-
-
-# DotMulProjection performs element-wise multiplication with weight
-@config_class
-class DotMulProjection(Projection):
-    type = 'dot_mul'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [1, output_size]
-
-
-# ScalingProjection
-@config_class
-class ScalingProjection(Projection):
-    type = 'scaling'
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size
-
-    def calc_parameter_size(self, input_size, output_size):
-        return 1
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [1, 1]
-
-
-@config_class
-class TableProjection(Projection):
-    type = 'table'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [input_size, output_size]
-
-
-@config_class
-class FullMatrixProjection(Projection):
-    type = 'fc'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [input_size, output_size]
-
-
-@config_class
-class TransposedFullMatrixProjection(Projection):
-    type = 'trans_fc'
-
-    def calc_parameter_size(self, input_size, output_size):
-        return input_size * output_size
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [output_size, input_size]
-
-
-@config_class
-class ContextProjection(Projection):
-    type = 'context'
-
-    def __init__(self, input_layer_name, context_start, context_length,
-                 trainable_padding, **xargs):
-        super(ContextProjection, self).__init__(input_layer_name, **xargs)
-        self.proj_conf.context_start = context_start
-        self.proj_conf.context_length = context_length
-        self.proj_conf.trainable_padding = trainable_padding
-        self._total_pad = max(0, -self.proj_conf.context_start) \
-                          + max(0, self.proj_conf.context_start \
-                                + self.proj_conf.context_length - 1)
-
-    def calc_output_size(self, input_layer_config):
-        return input_layer_config.size * self.proj_conf.context_length
-
-    def calc_parameter_size(self, input_size, output_size):
-        if self.proj_conf.trainable_padding == False:
-            return 0
-        else:
-            return input_size * self._total_pad
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return [self._total_pad, input_size]
-
-    _total_pad = 0
-
-
-@config_class
-class ConvBaseProjection(Projection):
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvBaseProjection, self).__init__(input_layer_name, **xargs)
-
-        if num_filters is not None:
-            self.proj_conf.num_filters = num_filters
-
-    def calc_output_size(self, input_layer_config):
-        return self.proj_conf.output_size
-
-    def calc_parameter_size(self, input_size, output_size):
-        co = self.proj_conf.num_filters
-        ci = self.proj_conf.conv_conf.channels
-        fh = self.proj_conf.conv_conf.filter_size
-        fw = self.proj_conf.conv_conf.filter_size_y
-        gr = self.proj_conf.conv_conf.groups
-        return co * ci * fh * fw / gr
-
-    def calc_bias_size(self):
-        return self.proj_conf.num_filters
-
-    def calc_parameter_dims(self, input_size, output_size):
-        return None
-
-
-@config_class
-class ConvProjection(ConvBaseProjection):
-    type = 'conv'
-
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvProjection, self).__init__(input_layer_name, num_filters,
-                                             conv_conf, **xargs)
-
-        parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf,
-                   num_filters)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
-                                     self.proj_conf.conv_conf.output_y * \
-                                     num_filters
-
-
-@config_class
-class ConvTransProjection(ConvBaseProjection):
-    type = 'convt'
-
-    def __init__(self,
-                 input_layer_name,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvTransProjection, self).__init__(input_layer_name, num_filters,
-                                                  conv_conf, **xargs)
-
-        parse_conv(
-            conv_conf,
-            self.input_layer_name,
-            self.proj_conf.conv_conf,
-            num_filters,
-            trans=True)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \
-                                     self.proj_conf.conv_conf.img_size * \
-                                     num_filters
-
-
-# Define a operator for mixed layer
-@config_class
-class Operator(Cfg):
-    type = None  # subclass should set it correctly
-
-    def __init__(
-            self,
-            input_layer_names, ):
-        self.add_keys(locals())
-        self.operator_conf = OperatorConfig()
-        self.operator_conf.type = self.type
-
-    def check_dims(self):
-        pass
-
-    def calc_output_size(self, input_sizes):
-        return 0
-
-
-@config_class
-class DotMulOperator(Operator):
-    type = 'dot_mul'
-
-    def __init__(self, input_layer_names, scale=None, **xargs):
-        super(DotMulOperator, self).__init__(input_layer_names, **xargs)
-        if scale is not None:
-            self.operator_conf.dotmul_scale = scale
-
-        config_assert(len(input_layer_names) == 2, "DotMul is binary operator")
-
-    def check_dims(self):
-        for i in range(2):
-            config_assert(self.operator_conf.input_sizes[i] ==
-                          self.operator_conf.output_size,
-                          "DotMul input_size != output_size")
-
-    def calc_output_size(self, input_sizes):
-        return input_sizes[0]
-
-
-@config_class
-class ConvOperator(Operator):
-    type = 'conv'
-
-    def __init__(self,
-                 input_layer_names,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvOperator, self).__init__(input_layer_names, **xargs)
-        if num_filters is not None:
-            self.operator_conf.num_filters = num_filters
-
-        parse_conv(conv_conf,
-                   MakeLayerNameInSubmodel(input_layer_names[0]),
-                   self.operator_conf.conv_conf, num_filters)
-        self.operator_conf.output_size = self.operator_conf.conv_conf.output_x * \
-                                         self.operator_conf.conv_conf.output_y * \
-                                         num_filters
-
-        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
-
-    def calc_output_size(self, input_sizes):
-        return self.operator_conf.output_size
-
-
-@config_class
-class ConvTransOperator(Operator):
-    type = 'convt'
-
-    def __init__(self,
-                 input_layer_names,
-                 num_filters=None,
-                 conv_conf=None,
-                 **xargs):
-        super(ConvTransOperator, self).__init__(input_layer_names, **xargs)
-        if num_filters is not None:
-            self.operator_conf.num_filters = num_filters
-
-        parse_conv(
-            conv_conf,
-            MakeLayerNameInSubmodel(input_layer_names[0]),
-            self.operator_conf.conv_conf,
-            num_filters,
-            trans=True)
-        self.operator_conf.output_size = \
-            self.operator_conf.conv_conf.img_size * \
-            self.operator_conf.conv_conf.img_size_y * \
-            num_filters
-
-        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
-
-    def calc_output_size(self, input_sizes):
-        return self.operator_conf.output_size
-
-
-# please refer to the comments in proto/ModelConfig.proto
-@config_class
-class Conv(Cfg):
-    def __init__(self,
-                 filter_size,
-                 channels,
-                 padding=None,
-                 stride=None,
-                 groups=None,
-                 filter_channels=None,
-                 output_x=None,
-                 img_size=None,
-                 caffe_mode=True,
-                 filter_size_y=None,
-                 padding_y=None,
-                 stride_y=None,
-                 dilation=None,
-                 dilation_y=None):
-        self.add_keys(locals())
-        if filter_size_y is None:
-            self.filter_size_y = filter_size
-        if padding_y is None:
-            self.padding_y = padding
-        if dilation_y is None:
-            self.dilation_y = dilation
-        if stride_y is None:
-            self.stride_y = stride
-        if output_x is not None:
-            config_assert(output_x <= 0)
-
-
-# please refer to the comments in proto/ModelConfig.proto
-@config_class
-class Conv3D(Cfg):
-    def __init__(self,
-                 filter_size,
-                 channels,
-                 padding=None,
-                 stride=None,
-                 groups=None,
-                 filter_channels=None,
-                 output_x=None,
-                 img_size=None,
-                 caffe_mode=True,
-                 filter_size_y=None,
-                 padding_y=None,
-                 stride_y=None,
-                 filter_size_z=None,
-                 padding_z=None,
-                 stride_z=None):
-        self.add_keys(locals())
-        self.filter_size_y = filter_size_y if filter_size_y else filter_size
-        self.filter_size_z = filter_size_z if filter_size_z else filter_size
-        self.padding_y = padding_y if padding_y else padding
-        self.padding_z = padding_z if padding_z else padding
-        self.stride_y = stride_y if stride_y else stride
-        self.stride_z = stride_z if stride_z else stride
-        if output_x is not None:
-            config_assert(output_x <= 0)
-
-
-@config_class
-class BilinearInterp(Cfg):
-    def __init__(self, out_size_x=None, out_size_y=None, channels=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Pool(Cfg):
-    def __init__(
-            self,
-            pool_type,
-            channels,
-            size_x,
-            size_y=None,
-            start=None,
-            stride=None,  # 1 by defalut in protobuf
-            stride_y=None,
-            padding=None,  # 0 by defalut in protobuf
-            padding_y=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Pool3d(Cfg):
-    def __init__(
-            self,
-            pool_type,
-            channels,
-            size_x,
-            size_y=None,
-            size_z=None,
-            start=None,
-            stride=None,  # 1 by defalut in protobuf
-            stride_y=None,
-            stride_z=None,
-            padding=None,  # 0 by defalut in protobuf
-            padding_y=None,
-            padding_z=None):
-        self.add_keys(locals())
-        self.filter_size_y = size_y if size_y else size_x
-        self.filter_size_z = size_z if size_z else size_x
-        self.padding_y = padding_y if padding_y else padding
-        self.padding_z = padding_z if padding_z else padding
-        self.stride_y = stride_y if stride_y else stride
-        self.stride_z = stride_z if stride_z else stride
-
-
-@config_class
-class SpatialPyramidPool(Cfg):
-    def __init__(self, pool_type, pyramid_height, channels):
-        self.add_keys(locals())
-
-
-@config_class
-class Pad(Cfg):
-    def __init__(self, channels, pad_c, pad_h, pad_w):
-        self.add_keys(locals())
-
-
-@config_class
-class Upsample(Cfg):
-    def __init__(self, scale, scale_y, pad_out_x, pad_out_y, upsample_size,
-                 upsample_size_y):
-        self.add_keys(locals())
-
-
-@config_class
-class Norm(Cfg):
-    def __init__(self,
-                 norm_type,
-                 channels,
-                 size,
-                 scale,
-                 pow,
-                 output_x=None,
-                 img_size=None,
-                 blocked=None):
-        self.add_keys(locals())
-
-
-@config_class
-class Image(Cfg):
-    def __init__(self, channels, img_size=None):
-        self.add_keys(locals())
-
-
-@config_class
-class BlockExpand(Cfg):
-    def __init__(self,
-                 channels,
-                 padding_x=0,
-                 padding_y=0,
-                 stride_x=0,
-                 stride_y=0,
-                 block_x=0,
-                 block_y=0,
-                 img_size_x=0,
-                 img_size_y=0,
-                 output_x=0,
-                 output_y=0):
-        self.add_keys(locals())
-
-
-@config_class
-class MaxOut(Cfg):
-    def __init__(self, channels, groups, img_size_x=0, img_size_y=0):
-        self.add_keys(locals())
-
-
-def create_data_config_proto(async_load_data=False,
-                             constant_slots=None,
-                             data_ratio=1,
-                             is_main_data=True,
-                             usage_ratio=None):
-    # default: all sub dataproviders are treat as "main data".
-    # see proto/DataConfig.proto for is_main_data
-    data_config = DataConfig()
-
-    data_config.async_load_data = async_load_data
-
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    data_config.data_ratio = data_ratio
-    data_config.is_main_data = is_main_data
-
-    usage_ratio = default(usage_ratio, settings_deprecated["usage_ratio"])
-    config_assert(usage_ratio >= 0 and usage_ratio <= 1,
-                  "The range of usage_ratio is [0, 1]")
-    data_config.usage_ratio = usage_ratio
-
-    return data_config
-
-
-@config_func
-def SimpleData(files=None,
-               feat_dim=None,
-               context_len=None,
-               buffer_capacity=None,
-               **xargs):
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = 'simple'
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    if context_len is not None:
-        data_config.context_len = context_len
-    if buffer_capacity:
-        data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-@config_func
-def PyData(files=None,
-           type=None,
-           file_group_queue_capacity=None,
-           load_data_module=None,
-           load_data_object=None,
-           load_data_args="",
-           load_file_count=None,
-           constant_slots=None,
-           load_thread_num=None,
-           **xargs):
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = 'py'
-    if load_data_module in g_py_module_name_list:
-
-        def get_path(module):
-            m = __import__(load_data_module)
-            return os.path.split(os.path.realpath(m.__file__))[0]
-
-        # python C-api is not thread safe, one module can only be import once,
-        # so here we nedd to copy the module with different names if it has to be
-        # imported several times.
-        module_new_name = "%s_copy_%d" % (load_data_module,
-                                          len(g_py_module_name_list))
-        g_py_module_name_list.append(module_new_name)
-        module_path = "%s/%s.py" % (get_path(load_data_module),
-                                    load_data_module)
-        new_module_path = "%s/%s.py" % (get_path(load_data_module),
-                                        module_new_name)
-        if os.path.isfile(module_path) == False:
-            raise Exception("File %s is not exist." % module_path)
-        shutil.copy2(module_path, new_module_path)
-        load_data_module = module_new_name
-    else:
-        g_py_module_name_list.append(load_data_module)
-    if load_data_module is not None and load_data_object is not None:
-        data_config.load_data_module = load_data_module
-        data_config.load_data_object = load_data_object
-    else:
-        raise ValueError('load_data_module, load_data_object is not defined.')
-    data_config.load_data_args = load_data_args
-
-    data_config.files = files or ''
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
-#real data for training is actually provided by "sub_data" data providers.
-@config_func
-def MultiData(sub_data=[]):
-    data_config = DataConfig()
-    data_config.type = 'multi'
-    data_config.sub_data_configs.extend(sub_data)
-    return data_config
-
-
-@config_func
-def Data(type,
-         files=None,
-         feat_dim=None,
-         slot_dims=None,
-         context_len=None,
-         buffer_capacity=None,
-         **xargs):
-
-    data_config = create_data_config_proto(**xargs)
-    data_config.type = type
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    data_config.slot_dims.extend(slot_dims)
-    if context_len is not None:
-        data_config.context_len = context_len
-    data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-@config_func
-def TrainData(data_config, async_load_data=None):
-    config_assert(not g_config.HasField('data_config'),
-                  'Only one TrainData definition is allowed')
-    g_config.data_config.CopyFrom(data_config)
-    g_config.data_config.for_test = False
-    if async_load_data is not None:
-        logger.warning("Deprecated: async_load_data should be used inside"
-                       " Data definition")
-        g_config.data_config.async_load_data = async_load_data
-
-
-@config_func
-def TestData(data_config, async_load_data=None):
-    config_assert(not g_config.HasField('test_data_config'),
-                  'Only one TestData definition is allowed')
-    g_config.test_data_config.CopyFrom(data_config)
-    g_config.test_data_config.for_test = True
-    if async_load_data is not None:
-        logger.warning("Deprecated: async_load_data should be used inside"
-                       " Data definition")
-        g_config.test_data_config.async_load_data = async_load_data
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size,
-                    filter_size,
-                    padding,
-                    stride,
-                    caffe_mode,
-                    dilation=1):
-    filter_s = (filter_size - 1) * dilation + 1
-    output = (2 * padding + img_size - filter_s) / float(stride)
-    if caffe_mode:
-        return 1 + int(math.floor(output))
-    else:
-        return 1 + int(math.ceil(output))
-
-
-#calcualte image_size based on output_size for de-convolution (ConvTransLayer).
-#It is the reverse function of cnn_output_size
-def cnn_image_size(output_size,
-                   filter_size,
-                   padding,
-                   stride,
-                   caffe_mode,
-                   dilation=1):
-    filter_s = (filter_size - 1) * dilation + 1
-    img_size = (output_size - 1) * stride + filter_s - 2 * padding
-    if not caffe_mode:
-        img_size = img_size + 1
-    return img_size
-
-
-def get_img_size(input_layer_name, channels):
-    input = g_layer_map[input_layer_name]
-    img_pixels = input.size / channels
-    img_size = input.width if input.width > 0 else int(img_pixels**0.5)
-    img_size_y = input.height if input.height > 0 else int(img_pixels /
-                                                           img_size)
-    config_assert(
-        img_size * img_size_y == img_pixels,
-        "Input layer %s: Incorrect input image size %d * %d for input image pixels %d"
-        % (input_layer_name, img_size, img_size_y, img_pixels))
-    return img_size, img_size_y
-
-
-def get_img3d_size(input_layer_name, channels):
-    input = g_layer_map[input_layer_name]
-    img_pixels = input.size / channels
-    img_size = input.width
-    img_size_y = input.height
-    img_size_z = input.depth
-
-    config_assert(
-        img_size * img_size_y * img_size_z == img_pixels,
-        "Input layer %s: Incorrect input image size %d * %d * %d for input image pixels %d"
-        % (input_layer_name, img_size, img_size_y, img_size_z, img_pixels))
-    return img_size, img_size_y, img_size_z
-
-
-def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
-    parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
-    bilinear_conf.out_size_x = bilinear.out_size_x
-    bilinear_conf.out_size_y = bilinear.out_size_y
-
-
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
-    pool_conf.pool_type = pool.pool_type
-    config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in " \
-              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
-                  "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
-
-    pool_conf.channels = pool.channels
-    pool_conf.size_x = pool.size_x
-    pool_conf.stride = pool.stride
-
-    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
-    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
-
-    pool_conf.img_size, pool_conf.img_size_y = \
-        get_img_size(input_layer_name, pool.channels)
-
-    config_assert(not pool.start, "start is deprecated in pooling.")
-
-    if pool.padding is not None:
-        pool_conf.padding = pool.padding
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
-                                         pool_conf.padding, pool_conf.stride,
-                                         not ceil_mode)
-    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
-                                         pool_conf.padding_y,
-                                         pool_conf.stride_y, not ceil_mode)
-    if exclude_mode != None:
-        pool_conf.exclude_mode = exclude_mode
-
-
-def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
-    pool_conf.pool_type = pool.pool_type
-    config_assert(pool.pool_type in ['max-projection', 'avg-projection'],
-                  "pool-type %s is not in "
-                  "['max-projection', 'avg-projection']" % pool.pool_type)
-
-    pool_conf.channels = pool.channels
-
-    pool_conf.size_x = pool.size_x
-    pool_conf.stride = pool.stride
-    pool_conf.padding = pool.padding
-
-    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
-    pool_conf.size_z = default(pool.size_z, pool_conf.size_x)
-    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
-    pool_conf.stride_z = default(pool.stride_z, pool_conf.stride)
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
-
-    pool_conf.img_size, pool_conf.img_size_y, pool_conf.img_size_z = \
-        get_img3d_size(input_layer_name, pool.channels)
-
-    config_assert(not pool.start, "start is deprecated in pooling.")
-
-    if pool.padding is not None:
-        pool_conf.padding = pool.padding
-    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
-    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
-    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
-                                         pool_conf.padding, pool_conf.stride,
-                                         not ceil_mode)
-    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
-                                         pool_conf.padding_y,
-                                         pool_conf.stride_y, not ceil_mode)
-    pool_conf.output_z = cnn_output_size(pool_conf.img_size_z, pool_conf.size_z,
-                                         pool_conf.padding_z,
-                                         pool_conf.stride_z, not ceil_mode)
-
-
-def parse_spp(spp, input_layer_name, spp_conf):
-    parse_image(spp, input_layer_name, spp_conf.image_conf)
-    spp_conf.pool_type = spp.pool_type
-    config_assert(spp.pool_type in ['max-projection', 'avg-projection'],
-                  "pool-type %s is not in "
-                  "['max-projection', 'avg-projection']" % spp.pool_type)
-    spp_conf.pyramid_height = spp.pyramid_height
-
-
-def parse_image(image, input_layer_name, image_conf):
-    image_conf.channels = image.channels
-    image_conf.img_size, image_conf.img_size_y = \
-        get_img_size(input_layer_name, image_conf.channels)
-
-
-def parse_image3d(image, input_layer_name, image_conf):
-    image_conf.channels = image.channels
-    image_conf.img_size, image_conf.img_size_y, image_conf.img_size_z = \
-        get_img3d_size(input_layer_name, image_conf.channels)
-
-
-def parse_norm(norm, input_layer_name, norm_conf):
-    norm_conf.norm_type = norm.norm_type
-    config_assert(
-        norm.norm_type in
-        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
-        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
-        % norm.norm_type)
-    norm_conf.channels = norm.channels
-    norm_conf.size = norm.size
-    norm_conf.scale = norm.scale
-    norm_conf.pow = norm.pow
-    norm_conf.blocked = norm.blocked
-
-    norm_conf.img_size, norm_conf.img_size_y = \
-        get_img_size(input_layer_name, norm.channels)
-    norm_conf.output_x = norm_conf.img_size
-    norm_conf.output_y = norm_conf.img_size_y
-    if norm.norm_type in ['cmrnorm-projection']:
-        norm_conf.scale /= norm.size
-    else:
-        norm_conf.scale /= norm.size**2
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
-    conv_conf.filter_size = conv.filter_size
-    conv_conf.filter_size_y = conv.filter_size_y
-    conv_conf.channels = conv.channels
-    conv_conf.padding = conv.padding
-    conv_conf.padding_y = conv.padding_y
-    conv_conf.stride = conv.stride
-    conv_conf.stride_y = conv.stride_y
-    conv_conf.groups = conv.groups
-    conv_conf.caffe_mode = conv.caffe_mode
-    if not conv.dilation:
-        conv.dilation = 1
-        conv.dilation_y = 1
-    else:
-        conv_conf.dilation = conv.dilation
-        conv_conf.dilation_y = conv.dilation_y
-
-    if not trans:
-        conv_conf.filter_channels = conv.channels / conv.groups
-        conv_conf.img_size, conv_conf.img_size_y = \
-            get_img_size(input_layer_name, conv.channels)
-        conv_conf.output_x = cnn_output_size(
-            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
-        conv_conf.output_y = cnn_output_size(
-            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
-    else:
-        conv_conf.filter_channels = num_filters / conv.groups
-        conv_conf.output_x, conv_conf.output_y = \
-            get_img_size(input_layer_name, conv.channels)
-        conv_conf.img_size = cnn_image_size(
-            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
-        conv_conf.img_size_y = cnn_image_size(
-            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
-
-
-#caffe_mode: compute the output size using floor instead of ceil,
-#            which is consistent of caffe and CuDNN's convention.
-def parse_conv3d(conv, input_layer_name, conv_conf, num_filters, trans=False):
-    conv_conf.filter_size = conv.filter_size
-    conv_conf.filter_size_y = conv.filter_size_y
-    conv_conf.filter_size_z = conv.filter_size_z
-    conv_conf.channels = conv.channels
-    conv_conf.padding = conv.padding
-    conv_conf.padding_y = conv.padding_y
-    conv_conf.padding_z = conv.padding_z
-    conv_conf.stride = conv.stride
-    conv_conf.stride_y = conv.stride_y
-    conv_conf.stride_z = conv.stride_z
-    conv_conf.groups = conv.groups
-    conv_conf.caffe_mode = conv.caffe_mode
-
-    if not trans:
-        conv_conf.filter_channels = conv.channels / conv.groups
-        conv_conf.img_size, conv_conf.img_size_y, conv_conf.img_size_z = \
-            get_img3d_size(input_layer_name, conv.channels)
-        conv_conf.output_x = cnn_output_size(
-            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
-        conv_conf.output_y = cnn_output_size(
-            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
-        conv_conf.output_z = cnn_output_size(
-            conv_conf.img_size_z, conv_conf.filter_size_z, conv_conf.padding_z,
-            conv_conf.stride_z, conv_conf.caffe_mode)
-    else:
-        conv_conf.filter_channels = num_filters / conv.groups
-        conv_conf.output_x, conv_conf.output_y, conv_conf.output_z = \
-            get_img3d_size(input_layer_name, conv.channels)
-        conv_conf.img_size = cnn_image_size(
-            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
-        conv_conf.img_size_y = cnn_image_size(
-            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
-        conv_conf.img_size_z = cnn_image_size(
-            conv_conf.output_z, conv_conf.filter_size_z, conv_conf.padding_z,
-            conv_conf.stride_z, conv_conf.caffe_mode)
-
-
-def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
-    block_expand_conf.channels = block_expand.channels
-    block_expand_conf.stride_x = block_expand.stride_x
-    block_expand_conf.stride_y = block_expand.stride_y
-    block_expand_conf.padding_x = block_expand.padding_x
-    block_expand_conf.padding_y = block_expand.padding_y
-    block_expand_conf.block_x = block_expand.block_x
-    block_expand_conf.block_y = block_expand.block_y
-    block_expand_conf.img_size_x = block_expand.img_size_x
-    block_expand_conf.img_size_y = block_expand.img_size_y
-    if block_expand_conf.img_size_x == 0:
-        block_expand_conf.output_x = 0
-    else:
-        block_expand_conf.output_x = cnn_output_size(
-            block_expand.img_size_x, block_expand.block_x,
-            block_expand.padding_x, block_expand.stride_x, False)
-
-    if block_expand_conf.img_size_y == 0:
-        block_expand_conf.output_y = 0
-    else:
-        block_expand_conf.output_y = cnn_output_size(
-            block_expand.img_size_y, block_expand.block_y,
-            block_expand.padding_y, block_expand.stride_y, False)
-
-
-def parse_maxout(maxout, input_layer_name, maxout_conf):
-    parse_image(maxout, input_layer_name, maxout_conf.image_conf)
-    maxout_conf.groups = maxout.groups
-
-
-# Define an evaluator
-@config_func
-def Evaluator(name,
-              type,
-              inputs,
-              chunk_scheme=None,
-              num_chunk_types=None,
-              classification_threshold=None,
-              positive_label=None,
-              dict_file=None,
-              result_file=None,
-              num_results=None,
-              top_k=None,
-              delimited=None,
-              excluded_chunk_types=None,
-              overlap_threshold=None,
-              background_id=None,
-              evaluate_difficult=None,
-              ap_type=None):
-    evaluator = g_config.model_config.evaluators.add()
-    evaluator.type = type
-    evaluator.name = MakeLayerNameInSubmodel(name)
-    if type_of(inputs) == str:
-        inputs = [inputs]
-
-    evaluator.input_layers.extend(
-        [MakeLayerNameInSubmodel(name) for name in inputs])
-
-    if chunk_scheme is not None:
-        evaluator.chunk_scheme = chunk_scheme
-        evaluator.num_chunk_types = num_chunk_types
-    g_current_submodel.evaluator_names.append(evaluator.name)
-
-    if classification_threshold is not None:
-        evaluator.classification_threshold = classification_threshold
-    if positive_label is not None:
-        evaluator.positive_label = positive_label
-    if dict_file is not None:
-        evaluator.dict_file = dict_file
-
-    if result_file is not None:
-        evaluator.result_file = result_file
-    if num_results is not None:
-        evaluator.num_results = num_results
-    if top_k is not None:
-        evaluator.top_k = top_k
-    if delimited is not None:
-        evaluator.delimited = delimited
-
-    if excluded_chunk_types:
-        evaluator.excluded_chunk_types.extend(excluded_chunk_types)
-
-    if overlap_threshold is not None:
-        evaluator.overlap_threshold = overlap_threshold
-
-    if background_id is not None:
-        evaluator.background_id = background_id
-
-    if evaluate_difficult is not None:
-        evaluator.evaluate_difficult = evaluate_difficult
-
-    if ap_type is not None:
-        evaluator.ap_type = ap_type
-
-
-class LayerBase(object):
-    def __init__(
-            self,
-            name,
-            type,
-            size,  # size can be 0. In this case, subclass should set it.
-            inputs,
-            device=None,
-            active_type="",
-            drop_rate=0.,
-            coeff=None,
-            error_clipping_threshold=None):
-        config_assert('@' not in name,
-                      "layer name: %s contain special character @" % name)
-        global g_current_submodel
-        name = MakeLayerNameInSubmodel(name)
-
-        config_assert(name not in g_layer_map,
-                      'Duplicated layer name: %s' % name)
-
-        self.inputs = copy.deepcopy(inputs)
-        self.operators = []
-
-        if self.inputs is None:
-            self.inputs = []
-        elif type_of(self.inputs) != list:
-            self.inputs = [self.inputs]
-
-        self.config = g_config.model_config.layers.add()
-        assert isinstance(self.config, LayerConfig)
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        mkldnn_acts = ['relu', 'tanh', 'softmax']
-        if use_mkldnn and active_type in mkldnn_acts:
-            active_type = "mkldnn_" + active_type
-        self.config.name = name
-        self.config.type = type
-        self.config.active_type = active_type
-        if coeff is not None:
-            self.config.coeff = float(coeff)
-        if size != 0:
-            self.config.size = size
-        if drop_rate != 0:
-            self.config.drop_rate = drop_rate
-
-        if device is not None:
-            self.config.device = device
-        elif g_default_device is not None:
-            self.config.device = g_default_device
-
-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-
-        for input_index in xrange(len(self.inputs)):
-            input = self.inputs[input_index]
-            input_config = None
-            input_layer_name = ''
-            if type_of(input) == str:
-                input_layer_name = input
-                input_config = Input(
-                    input_layer_name=input,
-                    parameter_name=gen_parameter_name(name, input_index))
-                input_layer_name = input_config.input_layer_name
-            elif isinstance(input, Input):
-                input_layer_name = input.input_layer_name
-                input_config = input
-                if input_config.parameter_name is None:
-                    input_config.parameter_name = \
-                        gen_parameter_name(name, input_index)
-            elif isinstance(input, Operator):
-                self.operators.append(input)
-                input.operator_conf.input_indices.append(input_index)
-                input_config = Input(input.input_layer_names[0])
-                input_layer_name = input_config.input_layer_name
-            else:
-                raise ValueError('Wrong type for inputs: %s' % type_of(input))
-            config_assert(input_layer_name in g_layer_map,
-                          "Unknown input layer '%s' for layer %s" %
-                          (input_layer_name, name))
-            self.inputs[input_index] = input_config
-            layer_input = self.config.inputs.add()
-            layer_input.input_layer_name = input_config.input_layer_name
-            if input_config.input_layer_argument is not None:
-                layer_input.input_layer_argument = \
-                    input_config.input_layer_argument
-
-        g_layer_map[name] = self.config
-
-        g_current_submodel.layer_names.append(self.config.name)
-
-    def get_input_layer(self, input_index):
-        return g_layer_map[self.config.inputs[input_index].input_layer_name]
-
-    # will return the bias created if not *for_self*
-    def create_bias_parameter(
-            self,
-            bias,  # True/False or BiasCfg
-            size,
-            dims=None,
-            for_self=True,  # whether create bias for layer self
-    ):
-
-        if size == 0:
-            return
-        if dims is None:
-            dims = [1, size]
-
-        config_assert(
-            type_of(bias) == bool or type_of(bias) == Bias,
-            'Incorrect type for bias: %s' % type_of(bias))
-
-        if type_of(bias) == bool:
-            if bias:
-                bias = Bias()
-
-        if type_of(bias) == Bias:
-            if bias.parameter_name is None:
-                bias.parameter_name = gen_bias_parameter_name(self.config.name)
-            if bias.parameter_name not in g_parameter_map:
-                assert isinstance(self.config, LayerConfig)
-
-                Parameter(
-                    bias.parameter_name,
-                    size,
-                    self.config.device
-                    if self.config.HasField('device') else None,
-                    dims,
-                    bias.learning_rate,
-                    bias.momentum,
-                    decay_rate=bias.decay_rate,
-                    decay_rate_l1=bias.decay_rate_l1,
-                    initial_mean=bias.initial_mean,
-                    initial_std=bias.initial_std,
-                    initial_strategy=bias.initial_strategy,
-                    initial_smart=bias.initial_smart,
-                    num_batches_regularization=bias.num_batches_regularization,
-                    sparse_remote_update=bias.sparse_remote_update,
-                    gradient_clipping_threshold=bias.
-                    gradient_clipping_threshold,
-                    is_static=bias.is_static,
-                    is_shared=bias.is_shared,
-                    initializer=bias.initializer)
-            if for_self:
-                self.config.bias_parameter_name = bias.parameter_name
-            else:
-                return bias.parameter_name
-
-    def create_input_parameter(self,
-                               input_index,
-                               size,
-                               dims=None,
-                               sparse=None,
-                               format=None):
-        if dims is None:
-            # TODO(yuyang18): print warning and callstack here!
-            dims = list()
-
-        if size == 0:
-            return
-
-        input_config = self.inputs[input_index]
-
-        self.config.inputs[input_index].input_parameter_name = \
-            input_config.parameter_name
-
-        if input_config.parameter_name in g_parameter_map:
-            para = g_parameter_map[input_config.parameter_name]
-            config_assert(size == para.size, (
-                'Shared parameter "%s" does not ' + 'have same size: %s vs. %s')
-                          % (input_config.parameter_name, para.size, size))
-
-            config_assert(dims == para.dims, (
-                'Shared parameter "%s" does not ' + 'have same dims: %s vs. %s')
-                          % (input_config.parameter_name, para.dims, dims))
-            return
-
-        Parameter(
-            input_config.parameter_name,
-            size,
-            self.config.device if self.config.HasField("device") else None,
-            dims,
-            input_config.learning_rate,
-            input_config.momentum,
-            decay_rate=input_config.decay_rate,
-            decay_rate_l1=input_config.decay_rate_l1,
-            initial_mean=input_config.initial_mean,
-            initial_std=input_config.initial_std,
-            initial_strategy=input_config.initial_strategy,
-            initial_smart=input_config.initial_smart,
-            num_batches_regularization=input_config.num_batches_regularization,
-            sparse_remote_update=input_config.sparse_remote_update,
-            sparse_update=input_config.sparse_update,
-            gradient_clipping_threshold=input_config.
-            gradient_clipping_threshold,
-            sparse=sparse,
-            format=format,
-            is_static=input_config.is_static,
-            is_shared=input_config.is_shared,
-            update_hooks=input_config.update_hooks,
-            initializer=input_config.initializer)
-
-    def set_layer_size(self, size):
-        if self.config.size == 0:
-            self.config.size = size
-        else:
-            config_assert(self.config.size == size,
-                          'Different inputs result in' +
-                          'different layer size at layer %s' % self.config.name)
-
-    def set_layer_height_width(self, height, width):
-        self.config.height = height
-        self.config.width = width
-
-    def set_layer_depth(self, depth):
-        self.config.depth = depth
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        if is_print:
-            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, height, width, size))
-
-
-@config_layer('multi_class_cross_entropy_with_selfnorm')
-class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
-    def __init__(self, name, inputs, softmax_selfnorm_alpha=0.1, **xargs):
-        super(MultiClassCrossEntropySelfNormCostLayer, self).__init__(
-            name, 'multi_class_cross_entropy_with_selfnorm', 0, inputs, **xargs)
-        self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha
-
-
-@config_layer('cross_entropy_over_beam')
-class CrossEntropyOverBeamLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        config_assert(len(inputs) % 3 == 0, "Error input number.")
-        super(CrossEntropyOverBeamLayer, self).__init__(
-            name, 'cross_entropy_over_beam', 0, inputs, **xargs)
-        input_num = len(inputs) / 3
-        for i in range(input_num):
-            input_layer = self.get_input_layer(i * 3)
-            config_assert(input_layer.size == 1, (
-                "Inputs for this layer are made up of "
-                "several triples, in which the first one is scores over "
-                "all candidate paths, whose size should be equal to 1."))
-
-
-@config_layer('fc')
-class FCLayer(LayerBase):
-    layer_type = 'fc'
-
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        use_mkldnn_wgt = bool(
-            int(g_command_config_args.get("use_mkldnn_wgt", 0)))
-        if use_mkldnn:
-            self.layer_type = 'mkldnn_fc'
-            config_assert(
-                len(inputs) == 1,
-                "MKLDNNFCLayer support one and only one input!")
-        super(FCLayer, self).__init__(
-            name, self.layer_type, size, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            psize = self.config.size * input_layer.size
-            dims = [input_layer.size, self.config.size]
-            format = self.inputs[input_index].format
-            sparse = format == "csr" or format == "csc"
-            if use_mkldnn:
-                config_assert(not sparse,
-                              "MKLDNNFCLayer do not support sparse format yet")
-                if use_mkldnn_wgt:
-                    dims = [self.config.size, input_layer.size]
-            if sparse:
-                psize = self.inputs[input_index].nnz
-            else:
-                sparse = None
-
-            self.create_input_parameter(input_index, psize, dims, sparse,
-                                        format)
-        self.create_bias_parameter(bias, self.config.size)
-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-
-
-@config_layer('mkldnn_fc')
-class MKLDNNFcLayer(FCLayer):
-    layer_type = 'mkldnn_fc'
-
-
-@config_layer('selective_fc')
-class SelectiveFCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 bias=True,
-                 selective_fc_pass_generation=False,
-                 has_selected_colums=True,
-                 selective_fc_full_mul_ratio=0.02,
-                 selective_fc_parallel_plain_mul_thread_num=None,
-                 **xargs):
-        super(SelectiveFCLayer, self).__init__(
-            name, 'selective_fc', size, inputs=inputs, **xargs)
-        # user MUST know if selctive fc is used in training,
-        # parameter matrices saved by this layer are automatically transposed,
-        # BUT bias is not.
-
-        # if selective_fc is used only in testing mode, and parameters for
-        # this layer are trained by fully connected layers,
-        # then TranposedFullMatrixProjectin MUST be used in training
-        # to avoid manual transpose in testing.
-
-        self.config.selective_fc_pass_generation = selective_fc_pass_generation
-        self.config.has_selected_colums = has_selected_colums
-        self.config.selective_fc_full_mul_ratio = selective_fc_full_mul_ratio
-        if selective_fc_parallel_plain_mul_thread_num is not None:
-            self.config.selective_fc_parallel_plain_mul_thread_num = selective_fc_parallel_plain_mul_thread_num
-
-        input_num = len(self.inputs)
-        if has_selected_colums:
-            config_assert(input_num >= 2,
-                          ("if indices of selected columns are not specified, "
-                           "selective_fc Layer has at least two inputs"))
-            input_num -= 1
-
-        for input_index in xrange(input_num):
-            input_layer = self.get_input_layer(input_index)
-            psize = self.config.size * input_layer.size
-            dims = [input_layer.size, self.config.size]
-            dims = dims[::-1]  # transpose the parameter
-            format = self.inputs[input_index].format
-            sparse = format == "csr" or format == "csc"
-            if sparse:
-                psize = self.inputs[input_index].nnz
-
-            self.create_input_parameter(input_index, psize, dims, sparse,
-                                        format)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('print')
-class PrintLayer(LayerBase):
-    def __init__(self, name, inputs, format=None):
-        super(PrintLayer, self).__init__(name, 'print', 0, inputs)
-        if format is None:
-            format = "\n".join([
-                "layer=" + input.input_layer_name + " %s"
-                for input in self.inputs
-            ])
-        self.config.user_arg = format
-
-
-@config_layer('priorbox')
-class PriorBoxLayer(LayerBase):
-    def __init__(self, name, inputs, size, min_size, max_size, aspect_ratio,
-                 variance):
-        super(PriorBoxLayer, self).__init__(name, 'priorbox', 0, inputs)
-        config_assert(len(inputs) == 2, 'PriorBoxLayer must have 2 inputs')
-        input_layer = self.get_input_layer(1)
-        config_assert(
-            input_layer.type == 'data',
-            'Expecting the second input layer of an priorbox layer to be '
-            'a data layer')
-        config_assert(input_layer.width > 0, 'The data layer must set width')
-        config_assert(input_layer.height > 0, 'The data layer must set height')
-        config_assert(len(variance) == 4, 'The variance must have 4 inputs')
-        self.config.inputs[0].priorbox_conf.min_size.extend(min_size)
-        self.config.inputs[0].priorbox_conf.max_size.extend(max_size)
-        self.config.inputs[0].priorbox_conf.aspect_ratio.extend(aspect_ratio)
-        self.config.inputs[0].priorbox_conf.variance.extend(variance)
-        self.config.size = size
-
-
-@config_layer('multibox_loss')
-class MultiBoxLossLayer(LayerBase):
-    def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
-                 neg_pos_ratio, neg_overlap, background_id, **xargs):
-        super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0,
-                                                inputs)
-        config_assert(
-            len(inputs) == (input_num * 2 + 2),
-            'MultiBoxLossLayer does not have enough inputs')
-        config_assert(num_classes > background_id,
-                      'Classes number must greater than background ID')
-        self.config.inputs[0].multibox_loss_conf.num_classes = num_classes
-        self.config.inputs[
-            0].multibox_loss_conf.overlap_threshold = overlap_threshold
-        self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio
-        self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap
-        self.config.inputs[0].multibox_loss_conf.background_id = background_id
-        self.config.inputs[0].multibox_loss_conf.input_num = input_num
-        self.config.size = 1
-
-
-@config_layer('detection_output')
-class DetectionOutputLayer(LayerBase):
-    def __init__(self, name, inputs, size, input_num, num_classes,
-                 nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
-                 background_id, **xargs):
-        super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0,
-                                                   inputs)
-        config_assert(
-            len(inputs) == (input_num * 2 + 1),
-            'DetectionOutputLayer does not have enough inputs')
-        config_assert(num_classes > background_id,
-                      'Classes number must greater than background ID')
-        self.config.inputs[0].detection_output_conf.num_classes = num_classes
-        self.config.inputs[
-            0].detection_output_conf.nms_threshold = nms_threshold
-        self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
-        self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
-        self.config.inputs[
-            0].detection_output_conf.confidence_threshold = confidence_threshold
-        self.config.inputs[
-            0].detection_output_conf.background_id = background_id
-        self.config.inputs[0].detection_output_conf.input_num = input_num
-        self.config.size = size
-
-
-@config_layer('roi_pool')
-class ROIPoolLayer(LayerBase):
-    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
-                 num_channels, **xargs):
-        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
-        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
-        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
-        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
-        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
-        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
-
-
-@config_layer('data')
-class DataLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 depth=None,
-                 height=None,
-                 width=None,
-                 device=None):
-        super(DataLayer, self).__init__(
-            name, 'data', size, inputs=[], device=device)
-        if height and width:
-            self.set_layer_height_width(height, width)
-        if depth:
-            self.set_layer_depth(depth)
-
-
-'''
-DataNormLayer: A layer for data normalization
-Input: One and only one input layer is accepted. The input layer must
-       be DataLayer with dense data type
-Output: The normalization of the input data
-
-Reference:
-    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
-
-Example:
-    Layer(
-        name = "norm_input_layer",
-        type = "data_norm",
-        inputs = [Input("input_layer",
-                        parameter_name = "_slot0.stats")],
-        data_norm_strategy = "z-score",
-    )
-
-Note:
-  (1) The parameter has been calculated in the preprocessing stage,
-      and should be initialized by --init_model_path when training.
-  (2) Three data normalization methoeds are considered
-          z-score: y = (x-mean)/std
-          min-max: y = (x-min)/(max-min)
-          decimal-scaling: y = x/10^j, where j is the smallest integer such that max(|y|)<1
-'''
-
-
-@config_layer('data_norm')
-class DataNormLayer(LayerBase):
-    def __init__(self, name, inputs, data_norm_strategy="z-score", device=None):
-        super(DataNormLayer, self).__init__(
-            name, 'data_norm', 0, inputs=inputs, device=device)
-        self.config.data_norm_strategy = data_norm_strategy
-        config_assert(len(inputs) == 1, 'DataNormLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        para_size = 5 * input_layer.size
-        para_dims = [5, input_layer.size]
-        self.inputs[0].is_static = True
-        self.create_input_parameter(0, para_size, para_dims)
-
-
-@config_layer('prelu')
-class ParameterReluLayer(LayerBase):
-    layer_type = 'prelu'
-
-    def __init__(self, name, inputs, partial_sum=1, **args):
-        super(ParameterReluLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **args)
-
-        input_layer = self.get_input_layer(0)
-        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
-        config_assert(input_layer.size % partial_sum == 0,
-                      "a wrong setting for partial_sum")
-
-        dims = [1, input_layer.size / partial_sum]
-        self.set_layer_size(input_layer.size)
-        self.config.partial_sum = partial_sum
-        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
-
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                        self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
-
-
-@config_layer('conv')
-class ConvLayerBase(LayerBase):
-    layer_type = 'conv'
-
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=False,
-                 **xargs):
-        super(ConvLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
-        use_gpu = int(g_command_config_args.get("use_gpu", 0))
-        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-
-        # Automatically select cudnn_type for GPU, exconv for CPU
-        # and mkldnn_conv for MKLDNN
-        # if set type=conv, but still reserve the way user specify
-        # exconv, mkldnn_conv or cudnn_conv manually.
-        if self.layer_type == "cudnn_conv":
-            config_assert(use_gpu, "cudnn_conv only support GPU")
-
-        if self.layer_type == "mkldnn_conv":
-            config_assert(use_mkldnn, "mkldnn_conv only support MKLDNN")
-
-        if (use_gpu == 1 and self.layer_type != "exconv" and
-                self.layer_type != "mkldnn_conv" and
-            (parallel_nn == 0 or self.config.device > -1)):
-            self.layer_type = "cudnn_conv"
-        else:
-            self.layer_type = "mkldnn_conv" if use_mkldnn else "exconv"
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            parse_conv(self.inputs[input_index].conv, input_layer.name,
-                       conv_conf, num_filters)
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            self.set_cnn_layer(name, conv_conf.output_y, conv_conf.output_x,
-                               self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return self.config.num_filters * conv_conf.filter_channels \
-               * (conv_conf.filter_size * conv_conf.filter_size_y)
-
-
-@config_layer('exconv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'exconv'
-
-
-@config_layer('mkldnn_conv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'mkldnn_conv'
-
-
-@config_layer('cudnn_conv')
-class ConvLayer(ConvLayerBase):
-    layer_type = 'cudnn_conv'
-
-
-@config_layer('convt')
-class ConvTransLayerBase(LayerBase):
-    layer_type = 'convt'
-
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=False,
-                 **xargs):
-        super(ConvTransLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        use_gpu = int(g_command_config_args.get("use_gpu", 0))
-        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-
-        # Automatically select cudnn_type for GPU and exconvt for CPU
-        # if set type=exconvt, but still reserve the way user specify
-        # exconvt or cudnn_convt manually.
-        if self.layer_type == "cudnn_convt":
-            config_assert(use_gpu, "cudnn_convt only support GPU")
-
-        if (use_gpu == 1 and self.layer_type != "exconvt" and
-            (parallel_nn == 0 or self.config.device > -1)):
-            self.layer_type = "cudnn_convt"
-        else:
-            self.layer_type = "exconvt"
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            parse_conv(
-                self.inputs[input_index].conv,
-                input_layer.name,
-                self.config.inputs[input_index].conv_conf,
-                num_filters,
-                trans=True)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size,
-                               self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return conv_conf.channels * conv_conf.filter_channels \
-                    * (conv_conf.filter_size * conv_conf.filter_size_y)
-
-
-@config_layer('exconvt')
-class ConvTransLayer(ConvTransLayerBase):
-    layer_type = 'exconvt'
-
-
-@config_layer('cudnn_convt')
-class ConvTransLayer(ConvTransLayerBase):
-    layer_type = 'cudnn_convt'
-
-
-@config_layer('conv_3d')
-class Conv3DLayerBase(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs=[],
-                 bias=True,
-                 num_filters=None,
-                 shared_biases=True,
-                 **xargs):
-        super(Conv3DLayerBase, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-
-        # need to specify layer in config
-        self.config.type = self.layer_type
-
-        trans = False
-        if self.config.type == "deconv3d":
-            trans = True
-
-        if shared_biases is not None:
-            self.config.shared_biases = shared_biases
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            conv_conf = self.config.inputs[input_index].conv_conf
-            parse_conv3d(
-                self.inputs[input_index].conv,
-                input_layer.name,
-                conv_conf,
-                num_filters,
-                trans=trans
-            )  # for z-axis pad:0, strid:1, filter_size:1, img_size:1
-            psize = self.calc_parameter_size(conv_conf)
-            self.create_input_parameter(input_index, psize)
-            if trans:
-                self.set_cnn_layer(name, conv_conf.img_size_z,
-                                   conv_conf.img_size_y, conv_conf.img_size,
-                                   self.config.num_filters)
-            else:
-                self.set_cnn_layer(name, conv_conf.output_z, conv_conf.output_y,
-                                   conv_conf.output_x, self.config.num_filters)
-
-        psize = self.config.size
-        if shared_biases:
-            psize = self.config.num_filters
-        self.create_bias_parameter(bias, psize, [psize, 1])
-
-    def calc_parameter_size(self, conv_conf):
-        return self.config.num_filters * conv_conf.filter_channels \
-               * (conv_conf.filter_size * conv_conf.filter_size_y \
-                  * conv_conf.filter_size_z)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-
-@config_layer('conv3d')
-class Conv3DLayer(Conv3DLayerBase):
-    layer_type = 'conv3d'
-
-
-@config_layer('deconv3d')
-class Conv3DLayer(Conv3DLayerBase):
-    layer_type = 'deconv3d'
-
-
-@config_layer('norm')
-class NormLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        use_mkldnn = True if use_mkldnn and self.inputs[
-            0].norm.norm_type == 'cmrnorm-projection' else False
-        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            norm_conf = self.config.inputs[input_index].norm_conf
-            parse_norm(self.inputs[input_index].norm, input_layer.name,
-                       norm_conf)
-            norm_conf.scale = self.inputs[
-                input_index].norm.scale if use_mkldnn else norm_conf.scale
-            self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
-                               norm_conf.channels, False)
-            if norm_conf.norm_type == "cross-channel-norm":
-                self.create_input_parameter(0, norm_conf.channels,
-                                            [norm_conf.channels, 1])
-
-
-@config_layer('pool')
-class PoolLayer(LayerBase):
-    layer_type = 'pool'
-
-    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
-                 **xargs):
-        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
-        if self.layer_type == "mkldnn_pool":
-            config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
-        self.layer_type = 'mkldnn_pool' if use_mkldnn else 'pool'
-        super(PoolLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            pool_conf = self.config.inputs[input_index].pool_conf
-            parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode, exclude_mode)
-            self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
-                               pool_conf.channels)
-
-
-@config_layer('mkldnn_pool')
-class MKLDNNPoolLayer(PoolLayer):
-    layer_type = 'mkldnn_pool'
-
-
-@config_layer('pool3d')
-class Pool3DLayer(LayerBase):
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
-        super(Pool3DLayer, self).__init__(
-            name, 'pool3d', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            pool_conf = self.config.inputs[input_index].pool_conf
-            parse_pool3d(self.inputs[input_index].pool, input_layer.name,
-                         pool_conf, ceil_mode)
-            self.set_cnn_layer(name, pool_conf.output_z, pool_conf.output_y,
-                               pool_conf.output_x, pool_conf.channels)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth,
-                      height,
-                      width,
-                      channels,
-                      is_print=True):
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-
-@config_layer('spp')
-class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            spp_conf = self.config.inputs[input_index].spp_conf
-            parse_spp(self.inputs[input_index].spp, input_layer.name, spp_conf)
-            output_x = (pow(4, spp_conf.pyramid_height) - 1) / (4 - 1)
-            self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
-
-
-@config_layer('upsample')
-class UpsampleLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(UpsampleLayer, self).__init__(
-            name, 'upsample', 0, inputs=inputs, **xargs)
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].upsample_conf.image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-
-        upsample = self.inputs[0].upsample
-        output_x = 0
-        output_y = 0
-        output_size = 0
-
-        if upsample.scale:
-            self.config.inputs[0].upsample_conf.scale = upsample.scale
-            self.config.inputs[0].upsample_conf.scale_y = upsample.scale_y
-            output_x = input_layer.width * upsample.scale
-            output_y = input_layer.height * upsample.scale_y
-        self.config.inputs[0].upsample_conf.pad_out_x = upsample.pad_out_x
-        self.config.inputs[0].upsample_conf.pad_out_y = upsample.pad_out_y
-        if upsample.upsample_size:
-            self.config.inputs[
-                0].upsample_conf.upsample_size = upsample.upsample_size
-            self.config.inputs[
-                0].upsample_conf.upsample_size_y = upsample.upsample_size_y
-            output_x = upsample.upsample_size
-            output_y = upsample.upsample_size_y
-
-        output_size = image_conf.channels * output_x * output_y
-
-        self.set_layer_height_width(output_y, output_x)
-        self.set_layer_depth(input_layer.depth)
-        self.set_layer_size(output_size)
-
-
-@config_layer('pad')
-class PadLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
-        pad = self.inputs[0].pad
-        self.config.inputs[0].pad_conf.pad_c.extend(pad.pad_c)
-        self.config.inputs[0].pad_conf.pad_h.extend(pad.pad_h)
-        self.config.inputs[0].pad_conf.pad_w.extend(pad.pad_w)
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].pad_conf.image_conf
-        parse_image(pad, input_layer.name, image_conf)
-        out_ch = pad.channels + pad.pad_c[0] + pad.pad_c[1]
-        out_h = image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
-        out_w = image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
-        self.set_cnn_layer(name, out_h, out_w, out_ch)
-        self.config.size = out_ch * out_h * out_w
-
-
-@config_layer('crop')
-class CropLayer(LayerBase):
-    def __init__(self, name, inputs, axis, offset, shape, **xargs):
-        super(CropLayer, self).__init__(name, 'crop', 0, inputs=inputs, **xargs)
-        self.config.axis = axis
-        self.config.offset.extend(offset)
-        self.config.shape.extend(shape)
-
-        # get channel, width and height from input_0 layer
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-        # only support for 4-dims inputs and NCHW order
-        if (len(self.config.inputs) == 2):
-            self.set_layer_height_width(
-                self.get_input_layer(1).height, self.get_input_layer(1).width)
-            self.set_layer_size(self.get_input_layer(1).size)
-        else:
-            self.set_layer_height_width(shape[-2], shape[-1])
-            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
-
-
-@config_layer('batch_norm')
-class BatchNormLayer(LayerBase):
-    layer_type = 'batch_norm'
-
-    def __init__(self,
-                 name,
-                 inputs,
-                 bias=True,
-                 img3D=False,
-                 use_global_stats=True,
-                 epsilon=1e-5,
-                 moving_average_fraction=0.9,
-                 batch_norm_type=None,
-                 mean_var_names=None,
-                 **xargs):
-        if inputs is None:
-            inputs = []
-        elif not isinstance(inputs, list):
-            inputs = [inputs]
-        config_assert(
-            len(inputs) == 1, "BatchNormLayer must have one and only one input")
-        # Create Input for moving mean and std,
-        # in batch normalization layer.
-        # These paras no need to update, so set is_static is true.
-        # If not use is_static, even set learning_rate = 0, decay_rate = 0,
-        # these paras will change if set average_window in configure.
-        use_gpu = bool(int(g_command_config_args.get("use_gpu", 0)))
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        is_shared = True if not use_gpu else False
-        for i in xrange(2):
-            inputs.append(
-                Input(
-                    inputs[0].input_layer_name,
-                    initial_std=0.0,
-                    initial_mean=0.0,
-                    is_static=True,
-                    is_shared=is_shared,
-                    make_layer_name_in_submodel=False, ))
-
-        parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0)))
-        cudnn_version = int(g_command_config_args.get("cudnn_version", 0))
-        # Automatically select cudnn_batch_norm for GPU, batch_norm for CPU
-        # and mkldnn_batch_norm for MKLDNN. Also based on cudnn version.
-        if batch_norm_type == "mkldnn_batch_norm":
-            config_assert(use_mkldnn, "mkldnn_batch_norm only support MKLDNN")
-        use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
-                not use_mkldnn and batch_norm_type != "mkldnn_batch_norm" and \
-                ((not parallel_nn) or self.config.device > -1)
-        if use_cudnn:
-            self.layer_type = "cudnn_batch_norm"
-        else:
-            self.layer_type = "mkldnn_batch_norm" if use_mkldnn else "batch_norm"
-        super(BatchNormLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-
-        if use_global_stats is not None:
-            self.config.use_global_stats = use_global_stats
-        if moving_average_fraction is not None:
-            self.config.moving_average_fraction = moving_average_fraction
-        if epsilon is not None:
-            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
-            self.config.epsilon = epsilon
-
-        input_layer = self.get_input_layer(0)
-        image_conf = self.config.inputs[0].image_conf
-        if img3D:
-            parse_image3d(self.inputs[0].image, input_layer.name, image_conf)
-            # Only pass the width and height of input to batch_norm layer
-            # when either of it is non-zero.
-            if input_layer.width != 0 or input_layer.height != 0:
-                self.set_cnn_layer(
-                    input_layer_name=name,
-                    depth=image_conf.img_size_z,
-                    height=image_conf.img_size_y,
-                    width=image_conf.img_size,
-                    channels=image_conf.channels,
-                    is_print=True)
-            else:
-                self.set_layer_size(input_layer.size)
-        else:
-            parse_image(self.inputs[0].image, input_layer.name, image_conf)
-            # Only pass the width and height of input to batch_norm layer
-            # when either of it is non-zero.
-            if input_layer.width != 0 or input_layer.height != 0:
-                self.set_cnn_layer(
-                    input_layer_name=name,
-                    height=image_conf.img_size_y,
-                    width=image_conf.img_size,
-                    channels=image_conf.channels,
-                    is_print=True)
-            else:
-                self.set_layer_size(input_layer.size)
-
-        psize = self.calc_parameter_size(image_conf)
-        dims = [1, psize]
-        if mean_var_names is not None:
-            assert len(mean_var_names) == 2
-            self.inputs[1].parameter_name = mean_var_names[0]
-            self.inputs[2].parameter_name = mean_var_names[1]
-
-        self.create_input_parameter(0, psize)
-        self.create_input_parameter(1, psize, dims)
-        self.create_input_parameter(2, psize, dims)
-
-        self.create_bias_parameter(bias, psize)
-
-    def set_cnn_layer(self,
-                      input_layer_name,
-                      depth=None,
-                      height=None,
-                      width=None,
-                      channels=None,
-                      is_print=True):
-        depthIsNone = False
-        if depth is None:
-            depth = 1
-            depthIsNone = True
-        size = depth * height * width * channels
-        self.set_layer_size(size)
-        self.set_layer_height_width(height, width)
-        self.set_layer_depth(depth)
-        if is_print and depthIsNone:
-            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, height, width, size))
-        elif is_print:
-            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
-                  (input_layer_name, channels, depth, height, width, size))
-
-    def calc_parameter_size(self, image_conf):
-        return image_conf.channels
-
-
-@config_layer('trans')
-class TransLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(TransLayer, self).__init__(
-            name, 'trans', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'TransLayer must have one and only one input')
-        self.set_layer_size(self.get_input_layer(0).size)
-
-
-@config_layer('resize')
-class ResizeLayer(LayerBase):
-    def __init__(self, name, size, inputs, **xargs):
-        super(ResizeLayer, self).__init__(
-            name, 'resize', size=size, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ResizeLayer must have one and only one input')
-
-
-@config_layer('rotate')
-class RotateLayer(LayerBase):
-    def __init__(self, name, inputs, height, width, device=None):
-        super(RotateLayer, self).__init__(
-            name, 'rotate', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1,
-            'RotateLayer must have one and only one input')
-        self.set_layer_height_width(height, width)
-        self.set_layer_size(self.get_input_layer(0).size)
-
-
-@config_layer('blockexpand')
-class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, **xargs)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            parse_block_expand(
-                self.inputs[input_index].block_expand, input_layer.name,
-                self.config.inputs[input_index].block_expand_conf)
-            block_expand_conf = self.config.inputs[
-                input_index].block_expand_conf
-            self.set_layer_size(block_expand_conf.block_x *
-                                block_expand_conf.block_y *
-                                block_expand_conf.channels)
-
-
-@config_layer('maxout')
-class MaxOutLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(MaxOutLayer, self).__init__(
-            name, 'maxout', 0, inputs=inputs, **xargs)
-        input_layer = self.get_input_layer(0)
-        maxout_conf = self.config.inputs[0].maxout_conf
-        parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
-        out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
-                           maxout_conf.image_conf.img_size, out_channels)
-
-
-@config_layer('row_conv')
-class RowConvLayer(LayerBase):
-    def __init__(self, name, inputs, context_length, **xargs):
-        super(RowConvLayer, self).__init__(
-            name, 'row_conv', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'row convolution layer must have one and only one input.')
-        input_layer = self.get_input_layer(0)
-        row_conv_conf = self.config.inputs[0].row_conv_conf
-        row_conv_conf.context_length = context_length
-        self.set_layer_size(input_layer.size)
-        psize = context_length * input_layer.size
-        dims = [context_length, input_layer.size]
-        self.create_input_parameter(0, psize, dims)
-
-
-@config_layer('clip')
-class ClipLayer(LayerBase):
-    def __init__(self, name, inputs, min, max, **xargs):
-        super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ClipLayer must have one and only one input.')
-        config_assert(min < max, 'min must be less than max.')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        self.config.inputs[0].clip_conf.min = min
-        self.config.inputs[0].clip_conf.max = max
-
-
-@config_layer('scale_shift')
-class ScaleShiftLayer(LayerBase):
-    def __init__(self, name, inputs, bias=True, **xargs):
-        super(ScaleShiftLayer, self).__init__(
-            name, 'scale_shift', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'ScaleShiftLayer must have one and only one input.')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-        self.create_input_parameter(0, 1, [1, 1])
-        self.create_bias_parameter(bias, 1)
-
-
-# key: cost type
-# value: cost class
-g_cost_map = {}
-
-
-# define a cost layer without any parameters
-def define_cost(class_name, cost_type):
-    def init(cls, name, inputs, device=None, coeff=1.):
-        super(type(cls), cls).__init__(
-            name, cost_type, 1, inputs, device=device, coeff=coeff)
-
-    cls = type(class_name, (LayerBase, ), dict(__init__=init))
-    global g_cost_map
-    g_cost_map[cost_type] = cls
-
-
-define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
-define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam')
-define_cost('RankingCost', 'rank-cost')
-define_cost('AucValidation', 'auc-validation')
-define_cost('PnpairValidation', 'pnpair-validation')
-define_cost('SumOfSquaresCostLayer', 'square_error')
-define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
-define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
-define_cost('HuberTwoClassification', 'huber_classification')
-define_cost('SumCost', 'sum_cost')
-define_cost('SmoothL1Cost', 'smooth_l1')
-
-
-@config_layer('hsigmoid')
-class HierarchicalSigmoidLayer(LayerBase):
-    def __init__(self, name, num_classes, inputs, device=None, bias=True):
-        super(HierarchicalSigmoidLayer, self).__init__(
-            name, 'hsigmoid', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) >= 2,
-            'HierarchicalSigmoidLayer must have at least 2 inputs')
-        self.config.num_classes = num_classes
-        for input_index in xrange(len(self.inputs) - 1):
-            input_layer = self.get_input_layer(input_index)
-            psize = (num_classes - 1) * input_layer.size
-            dims = [num_classes - 1, input_layer.size]
-            self.create_input_parameter(input_index, psize, dims)
-        self.create_bias_parameter(bias, num_classes - 1)
-
-
-'''
-lambdaCost for lambdaRank LTR approach
-
-Usage:
-  Example: Layer(name = "cost", type = "lambda_cost", NDCG_num = 8,
-             max_sort_size = -1, inputs = ["output", "score"])
-
-  Input data: Samples of the same query should be loaded as a sequence,
-          by PyDataProvider etc.. User should provide
-          scores for each sample. The score slot should be the 2nd
-          input of lambdaRank layer.
-
-  NDCG_num = the size of NDCG, e.g., 5 for NDCG@5.
-    Note: NDCG_num must be less than or equal to the minimum
-          size of lists.
-
-  max_sort_size = the size of partial sorting in calculating gradient.
-    Note: If max_sort_size = -1, then for each list, the algorithm will
-          sort the entire list to get gradient.
-          In other cases, max_sort_size must be greater than or equal
-          to NDCG_num.
-          max_sort_size can be greater than the size of a list, in which
-          case the algorithm will sort the entire list to get gradient.
-'''
-
-
-@config_layer('lambda_cost')
-class LambdaCost(LayerBase):
-    def __init__(self, name, inputs, NDCG_num=5, max_sort_size=-1, device=None):
-        super(LambdaCost, self).__init__(
-            name, 'lambda_cost', 1, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 2, 'lambdaCost must have 2 inputs')
-        self.config.NDCG_num = NDCG_num
-        if max_sort_size != -1:
-            config_assert(
-                NDCG_num <= max_sort_size,
-                'NDCG_num must be less than or equal to max_sort_size')
-        self.config.max_sort_size = max_sort_size
-
-
-@config_layer('huber_regression')
-class HuberRegressionLoss(LayerBase):
-    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
-        super(HuberRegressionLoss, self).__init__(
-            name, 'huber_regression', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
-        self.config.delta = delta
-        self.config.coeff = coeff
-
-
-@config_layer('nce')
-class NCELayer(LayerBase):
-    def __init__(self,
-                 name,
-                 num_classes,
-                 inputs,
-                 num_neg_samples=10,
-                 neg_sampling_dist=None,
-                 bias=True,
-                 **xargs):
-        super(NCELayer, self).__init__(name, 'nce', 1, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) >= 2, 'NCELayer must have at least 2 inputs')
-        self.config.num_classes = num_classes
-        if neg_sampling_dist is not None:
-            config_assert(
-                len(neg_sampling_dist) == num_classes,
-                'len(neg_sampling_dist)(%s) is not same as num_classes (%s)' %
-                (len(neg_sampling_dist), num_classes))
-            s = sum(neg_sampling_dist)
-            config_assert(
-                abs(s - 1) < 1e-5,
-                'The sum of neg_sampling_dist (%s) is not 1' % s)
-
-            self.config.neg_sampling_dist.extend(neg_sampling_dist)
-
-        self.config.num_neg_samples = num_neg_samples
-        num_real_inputs = len(self.inputs) - 1
-        input_layer = self.get_input_layer(num_real_inputs)
-        config_assert(input_layer.type == 'data',
-                      'Expecting the last input layer of an nce layer to be '
-                      'a data layer')
-
-        if (num_real_inputs > 1 and input_layer.size == 1 and
-                self.get_input_layer(num_real_inputs - 1).type == 'data'):
-            # This input layer is assumed to be a sample weight layer
-            num_real_inputs -= 1
-
-        for input_index in xrange(num_real_inputs):
-            input_layer = self.get_input_layer(input_index)
-            psize = num_classes * input_layer.size
-            dims = [num_classes, input_layer.size]
-            self.create_input_parameter(input_index, psize, dims)
-        self.create_bias_parameter(bias, num_classes)
-
-
-@config_layer('addto')
-class AddToLayer(LayerBase):
-    layer_type = 'addto'
-
-    def __init__(self, name, inputs, bias=True, **xargs):
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        if self.layer_type == "mkldnn_addto":
-            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
-        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
-        super(AddToLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
-
-        layer_size = self.get_input_layer(0).size
-        # To reserve heght, width, depth.
-        layer_with_hwc = self.get_input_layer(0)
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            assert layer_size == input_layer.size
-            if input_layer.height and input_layer.height and input_layer.height:
-                layer_with_hwc = input_layer
-
-        self.set_layer_size(layer_with_hwc.size)
-        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
-        self.set_layer_depth(layer_with_hwc.depth)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('mkldnn_addto')
-class MKLDNNAddtoLayer(AddToLayer):
-    layer_type = 'mkldnn_addto'
-
-
-@config_layer('agent')
-class AgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(AgentLayer, self).__init__(
-            name, 'agent', size, inputs=[], device=device)
-
-
-@config_layer('gather_agent')
-class GatherAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(GatherAgentLayer, self).__init__(
-            name, 'gather_agent', size, inputs=[], device=device)
-
-
-@config_layer('scatter_agent')
-class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, width=None, height=None, device=None):
-        super(ScatterAgentLayer, self).__init__(
-            name, 'scatter_agent', size, inputs=[], device=device)
-        if height and width:
-            self.set_layer_height_width(height, width)
-
-
-@config_layer('multiplex')
-class MultiplexLayer(LayerBase):
-    def __init__(self, name, inputs, size, device=None):
-        super(MultiplexLayer, self).__init__(
-            name, 'multiplex', size, inputs=inputs, device=device)
-        config_assert(
-            len(inputs) > 2, 'MultiplexLayer should have more than 2 inputs.')
-        for i in range(1, len(inputs)):
-            config_assert(
-                self.get_input_layer(i).size == size,
-                "All the input layers except the first one should"
-                "have the same size as the MultiplexLayer.")
-
-
-@config_func
-def Link(name, has_subseq=False):
-    """
-    Still keeping has_subseq for backward compatibility
-    """
-    link_config = LinkConfig()
-    link_config.link_name = name
-    return link_config
-
-
-# memory for recurrent layer group.
-# *name* and *size* are actual layer's name and size.
-# If *name* is None, need to provide *memory_name* and need to use
-# SetMemoryInput() later to specify the layer which this memory remembers.
-#
-# return the name of the memory,
-# use this name if you assign the memory as other layer's input
-#
-# boot frame of memory is zeroed by default,
-# or initialize by boot layer output if *boot_layer* set,
-# or initialize by trainable bias if *boot_bias* set,
-# or initialize by a constant id if *boot_with_const_id* set
-#
-# Memory can be a sequence if *is_sequence* set, this type of memory
-# can only be initailized by a *boot_layer* which is a sequence.
-#
-@config_func
-def Memory(name,
-           size,
-           is_sequence=False,
-           boot_layer=None,
-           boot_bias=False,
-           boot_bias_active_type="",
-           boot_with_const_id=None,
-           memory_name=None):
-    if not memory_name:
-        config_assert(name is not None, "name needs cannot be None")
-        memory_name = name + "+delay1"
-    agent_name = memory_name
-    agent_layer = AgentLayer(agent_name, size)
-    config_assert(g_current_submodel.is_recurrent_layer_group,
-                  'Memory should be used in recurrent layer group only')
-    memory = g_current_submodel.memories.add()
-    if name is not None:
-        memory.layer_name = MakeLayerNameInSubmodel(name)
-    memory.link_name = MakeLayerNameInSubmodel(agent_name)
-    options = sum((boot_layer is not None, bool(boot_bias),
-                   boot_with_const_id is not None))
-    config_assert(
-        options <= 1,
-        'take one option at most from boot_layer, boot_bias, or boot_with_const_id'
-    )
-    if boot_layer is not None:
-        boot_layer = MakeLayerNameInParentSubmodel(boot_layer)
-        config_assert(boot_layer in g_layer_map,
-                      'boot_layer "%s" does not correspond to a layer name' %
-                      boot_layer)
-        memory.boot_layer_name = boot_layer
-    elif boot_bias:
-        memory.boot_bias_parameter_name = agent_layer.create_bias_parameter(
-            boot_bias, size, for_self=False)
-        memory.boot_bias_active_type = boot_bias_active_type
-    elif boot_with_const_id is not None:
-        memory.boot_with_const_id = boot_with_const_id
-    return agent_name
-
-
-@config_func
-def SetMemoryInput(memory_name, layer_name):
-    memory_name = MakeLayerNameInSubmodel(memory_name)
-    layer_name = MakeLayerNameInSubmodel(layer_name)
-    for mem in g_current_submodel.memories:
-        if mem.link_name == memory_name:
-            mem.layer_name = layer_name
-            return
-    logger.fatal("Nonexistent memory name: " + memory_name)
-
-
-# Generator for recurrent layer group, to use it:
-#  1. define a id layer as output of layer group
-#  2. define a memory of this id layer, and assign a boot id(begin of sequence)
-#  3. define a eos check layer and fill its name in generator's *eos_layer_name*
-# Sequence generation will stop when eos check return 1 or *max_num_frames* reached.
-# If *beam_size* is greater than one, generator will use beam search.
-#   in beam search, if *num_results_per_sample* set, one sample sequence can output
-#   multiple results each with a probility.
-@config_func
-def Generator(
-        max_num_frames,
-        eos_layer_name="eos_check",
-        num_results_per_sample=1,
-        beam_size=1,
-        log_prob=None, ):
-    generator_config = GeneratorConfig()
-    generator_config.max_num_frames = max_num_frames
-    generator_config.eos_layer_name = eos_layer_name
-    generator_config.num_results_per_sample = num_results_per_sample
-    generator_config.beam_size = beam_size
-    if log_prob is not None:
-        generator_config.log_prob = log_prob
-    return generator_config
-
-
-@config_layer('expand')
-class ExpandLayer(LayerBase):
-    def __init__(self, name, inputs, trans_type='non-seq', bias=False, **xargs):
-        super(ExpandLayer, self).__init__(
-            name, 'expand', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs')
-        self.config.trans_type = trans_type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('featmap_expand')
-class FeatMapExpandLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 num_filters=None,
-                 as_row_vector=True,
-                 bias=False,
-                 **xargs):
-        super(FeatMapExpandLayer, self).__init__(
-            name, 'featmap_expand', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'ExpandLayer takes 1 and only 1 inputs')
-        if num_filters is not None:
-            self.config.num_filters = num_filters
-        else:
-            logger.fatal("FeatMapExpandLayer must specify num_filters.")
-        if not as_row_vector:
-            self.config.user_arg = "as_col_vec"
-        self.set_layer_size(self.get_input_layer(0).size * num_filters)
-
-
-@config_layer('max')
-class MaxLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 output_max_index=None,
-                 stride=-1,
-                 **xargs):
-        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-        if output_max_index is not None:
-            self.config.output_max_index = output_max_index
-
-
-@config_layer('maxid')
-class MaxIdLayer(LayerBase):
-    def __init__(self, name, inputs, beam_size=None, device=None):
-        super(MaxIdLayer, self).__init__(
-            name, 'maxid', 0, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 1, 'MaxIdLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-
-        if beam_size is None:
-            global g_current_submodel
-            if g_current_submodel.HasField("generator"):
-                self.config.beam_size = g_current_submodel.generator.beam_size
-        else:
-            self.config.beam_size = beam_size
-
-
-@config_layer('eos_id')
-class EosIdLayer(LayerBase):
-    def __init__(self, name, inputs, eos_id, device=None):
-        super(EosIdLayer, self).__init__(
-            name, 'eos_id', 0, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 1, 'EosIdLayer must have 1 input')
-        self.set_layer_size(2)  # boolean output
-        self.config.eos_id = eos_id
-
-
-@config_layer('seqlastins')
-class SequenceLastInstanceLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(SequenceLastInstanceLayer, self).__init__(
-            name, 'seqlastins', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        self.set_layer_size(self.get_input_layer(0).size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('seqfirstins')
-class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(SequenceFirstInstanceLayer, self).__init__(
-            name,
-            inputs=inputs,
-            trans_type=trans_type,
-            bias=bias,
-            stride=stride,
-            **xargs)
-        self.config.select_first = True
-
-
-@config_layer('seqconcat')
-class SequenceConcatLayer(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        super(SequenceConcatLayer, self).__init__(
-            name, 'seqconcat', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('seqreshape')
-class SequenceReshapeLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=False, **xargs):
-        super(SequenceReshapeLayer, self).__init__(
-            name, 'seqreshape', size, inputs=inputs, **xargs)
-        config_assert(
-            len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
-        self.set_layer_size(size)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('subseq')
-class SubSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        super(SubSequenceLayer, self).__init__(
-            name, 'subseq', 0, inputs=inputs, **xargs)
-        config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('seq_slice')
-class SeqSliceLayer(LayerBase):
-    def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
-        if isinstance(inputs, list):
-            assert len(inputs) == 1, ('the first input of sequence slice layer '
-                                      'is a single sequence input.')
-        else:
-            inputs = [inputs]
-
-        if starts is not None:
-            if isinstance(starts, list):
-                assert len(starts) == 1, (
-                    'the start indices for sequence slice layer cannot '
-                    'be a list having more than one element.')
-                starts = starts[0]
-            inputs.append(starts)
-
-        if ends is not None:
-            if isinstance(ends, list):
-                assert len(ends) == 1, (
-                    'the end indices for sequence slice layer cannot '
-                    'be a list having more than one element.')
-                ends = ends[0]
-            inputs.append(ends)
-        assert len(inputs) >= 2, (
-            'the sequence slice layer has at least two inputs.')
-
-        super(SeqSliceLayer, self).__init__(
-            name, 'seq_slice', 0, inputs=inputs, **xargs)
-
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-
-        if len(inputs) == 3:
-            assert (
-                self.get_input_layer(1).size == self.get_input_layer(2).size), (
-                    'If start and end indices are both given to'
-                    'sequence slice layer, they should have the same width.')
-        elif len(inputs) == 2:
-            self.config.select_first = (starts is not None)
-
-
-@config_layer('sub_nested_seq')
-class SubNestedSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
-        if isinstance(inputs, list):
-            assert len(inputs) == 1, ('the first input of sub_nested_seq '
-                                      'layer is a single nested sequence.')
-            inputs = inputs[0]
-        if isinstance(selected_indices, list):
-            assert len(selected_indices) == 1, (
-                'the second input of '
-                'sub_nested_seq layer is a single layer which is a '
-                'set of selected indices.')
-            selected_indices = selected_indices[0]
-
-        super(SubNestedSequenceLayer, self).__init__(
-            name,
-            'sub_nested_seq',
-            0,
-            inputs=[inputs, selected_indices],
-            **xargs)
-        input_layer0 = self.get_input_layer(0)
-        size = input_layer0.size
-        self.set_layer_size(size)
-
-
-@config_layer('dot_prod')
-class DotProdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(DotProdLayer, self).__init__(
-            name, 'dot_prod', 0, inputs, device=device)
-        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            "Two inputs should have the same size.")
-        self.set_layer_size(1)
-
-
-@config_layer('out_prod')
-class OuterProdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(OuterProdLayer, self).__init__(
-            name, 'out_prod', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'OuterProdLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer0.size * input_layer1.size)
-
-
-@config_layer('power')
-class PowerLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(PowerLayer, self).__init__(
-            name, 'power', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'PowerLayer must have 2 inputs')
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer1.size)
-        input_layer0 = self.get_input_layer(0)
-        config_assert(1 == input_layer0.size,
-                      'The left input is the exponent and should be of size 1')
-
-
-@config_layer('slope_intercept')
-class SlopeInterceptLayer(LayerBase):
-    def __init__(self, name, inputs, slope=1.0, intercept=0.0, device=None):
-        super(SlopeInterceptLayer, self).__init__(
-            name, 'slope_intercept', 0, inputs=inputs, device=device)
-        self.config.slope = slope
-        self.config.intercept = intercept
-        config_assert(len(inputs) == 1, 'SlopeInterceptLayer must have 1 input')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('scaling')
-class ScalingLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(ScalingLayer, self).__init__(
-            name, 'scaling', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'ScalingLayer must have 2 inputs')
-        input_layer1 = self.get_input_layer(1)
-        self.set_layer_size(input_layer1.size)
-        input_layer0 = self.get_input_layer(0)
-        config_assert(1 == input_layer0.size,
-                      'The left input should be of size 1')
-
-
-@config_layer('conv_shift')
-class ConvShiftLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(ConvShiftLayer, self).__init__(
-            name, 'conv_shift', 0, inputs=inputs, device=device)
-        config_assert(len(inputs) == 2, 'ConvShiftLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('convex_comb')
-class ConvexCombinationLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
-        super(ConvexCombinationLayer, self).__init__(
-            name, 'convex_comb', size, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, 'ConvexCombinationLayer must have 2 inputs')
-        config_assert(
-            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for ConvexCombinationLayer')
-        self.set_layer_size(size)
-
-
-@config_layer('interpolation')
-class InterpolationLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(InterpolationLayer, self).__init__(
-            name, 'interpolation', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 3, 'InterpolationLayer must have 3 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        input_layer2 = self.get_input_layer(2)
-        self.set_layer_size(input_layer1.size)
-        config_assert(input_layer0.size == 1, 'weight should be of size 1')
-        config_assert(input_layer1.size == input_layer2.size,
-                      'the two vector inputs should be of the same size')
-
-
-@config_layer('bilinear_interp')
-class BilinearInterpLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(BilinearInterpLayer, self).__init__(
-            name, 'bilinear_interp', 0, inputs=inputs, **xargs)
-        input_layer = self.get_input_layer(0)
-        conf = self.config.inputs[0].bilinear_interp_conf
-        parse_bilinear(self.inputs[0].bilinear_interp, input_layer.name, conf)
-        self.set_cnn_layer(name, conf.out_size_y, conf.out_size_x,
-                           conf.image_conf.channels)
-
-
-@config_layer('sum_to_one_norm')
-class SumToOneNormLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(SumToOneNormLayer, self).__init__(
-            name, 'sum_to_one_norm', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1, 'SumToOneNormLayer must have 1 input')
-        input_layer0 = self.get_input_layer(0)
-        self.set_layer_size(input_layer0.size)
-
-
-@config_layer('row_l2_norm')
-class RowL2NormLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
-        super(RowL2NormLayer, self).__init__(
-            name, 'row_l2_norm', 0, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        self.set_layer_size(input_layer.size)
-
-
-@config_layer('cos')
-class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=1, device=None):
-        super(CosSimLayer, self).__init__(
-            name, 'cos', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2,
-            'The CosSimLayer expects two and only two inputs.')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'The two inputs of CosSimLayer must have the same dimensionality.')
-        self.config.cos_scale = cos_scale
-
-
-@config_layer('cos_vm')
-class CosSimVecMatLayer(LayerBase):
-    def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
-        super(CosSimVecMatLayer, self).__init__(
-            name, 'cos_vm', size, inputs=inputs, device=device)
-        self.config.cos_scale = cos_scale
-        config_assert(
-            len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.')
-        config_assert(
-            size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for CosSimVecMatLayer.')
-
-
-@config_layer('l2_distance')
-class L2DistanceLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(L2DistanceLayer, self).__init__(
-            name, 'l2_distance', 1, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 2, ('The L2DistanceLayer must have '
-                                    'and only have 2 inputs.'))
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            ('Two inputs of the L2DistanceLayer must have '
-             'the same dimensionality.'))
-
-
-@config_layer('sampling_id')
-class SamplingIdLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(SamplingIdLayer, self).__init__(
-            name, 'sampling_id', 0, inputs=inputs, device=device)
-        config_assert(
-            len(self.inputs) == 1, 'SamplingIdLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-
-
-# AverageLayer: "average" for each sample within a sequence.
-# average_stratrgy: set to one of the following:
-# 'average': plain average.
-# 'sum': sum each sample instead of average (which is divide by sample_num).
-# 'squarerootn': sum each sample, but divide by sqrt(sample_num).
-@config_layer('average')
-class AverageLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 average_strategy='average',
-                 trans_type='non-seq',
-                 bias=False,
-                 stride=-1,
-                 **xargs):
-        super(AverageLayer, self).__init__(
-            name, 'average', 0, inputs=inputs, **xargs)
-        self.config.average_strategy = average_strategy
-        if trans_type == 'seq':
-            config_assert(stride == -1, 'subseq does not support stride window')
-        self.config.trans_type = trans_type
-        self.config.seq_pool_stride = stride
-        config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('tensor')
-class TensorLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
-        super(TensorLayer, self).__init__(
-            name, 'tensor', size, inputs=inputs, **xargs)
-        config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs')
-        config_assert(size > 0, 'size must be positive')
-        config_assert(inputs[1].parameter_name == None,
-                      'second parameter should be None.')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        psize = size * input_layer0.size * input_layer1.size
-        dims = [input_layer0.size, input_layer1.size, size]
-        self.create_input_parameter(0, psize, dims)
-        self.create_bias_parameter(bias, size)
-
-
-@config_layer('mixed')
-class MixedLayer(LayerBase):
-    def __init__(self, name, inputs, size=0, bias=True, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        super(MixedLayer, self).__init__(
-            name, 'mixed', size, inputs=inputs, **xargs)
-        operator_input_index = []
-        for operator in self.operators:
-            operator_conf = operator.operator_conf
-            for i in xrange(1, len(operator.input_layer_names)):
-                input_index = len(self.config.inputs)
-                operator_conf.input_indices.append(input_index)
-                input_config = Input(operator.input_layer_names[i])
-                self.inputs.append(input_config)
-                layer_input = self.config.inputs.add()
-                layer_input.input_layer_name = input_config.input_layer_name
-            for input_index in operator_conf.input_indices:
-                input_layer = self.get_input_layer(input_index)
-                operator_conf.input_sizes.append(input_layer.size)
-                operator_input_index.append(input_index)
-            if self.config.size == 0:
-                size = operator.calc_output_size(operator_conf.input_sizes)
-                if size != 0:
-                    self.set_layer_size(size)
-            else:
-                sz = operator.calc_output_size(operator_conf.input_sizes)
-                if sz != 0:
-                    config_assert(
-                        sz == self.config.size,
-                        "different inputs have different size: %s vs. %s" %
-                        (sz, self.config.size))
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            if input_index not in operator_input_index:
-                config_assert(
-                    isinstance(input, Projection),
-                    "input should be projection or operation")
-            if self.config.size == 0 and isinstance(input, Projection):
-                size = input.calc_output_size(input_layer)
-                if size != 0:
-                    self.set_layer_size(size)
-            elif isinstance(input, Projection):
-                sz = input.calc_output_size(input_layer)
-                if sz != 0:
-                    config_assert(
-                        sz == self.config.size,
-                        "different inputs have different size: %s vs. %s" %
-                        (sz, self.config.size))
-        config_assert(size != 0, "size is not set")
-
-        for input_index in xrange(len(self.inputs)):
-            input = self.inputs[input_index]
-            if isinstance(input, Projection):
-                input_layer = self.get_input_layer(input_index)
-                input.proj_conf.input_size = input_layer.size
-                input.proj_conf.output_size = size
-
-                input_config = self.config.inputs[input_index]
-                input_config.proj_conf.CopyFrom(input.proj_conf)
-                input_config.proj_conf.name = gen_parameter_name(name,
-                                                                 input_index)
-                psize = input.calc_parameter_size(input_layer.size, size)
-                dims = input.calc_parameter_dims(input_layer.size, size)
-                self.create_input_parameter(input_index, psize, dims)
-
-        for operator in self.operators:
-            operator_conf = operator.operator_conf
-            operator_conf.output_size = self.config.size
-            operator.check_dims()
-            record_operator_conf = self.config.operator_confs.add()
-            record_operator_conf.CopyFrom(operator_conf)
-
-        psize = self.config.size
-        if isinstance(self.inputs[0], ConvProjection):
-            self.config.shared_biases = True
-            psize = 0
-            for input in self.inputs:
-                psize += input.calc_bias_size()
-
-        if bias:
-            self.config.bias_size = psize
-            self.create_bias_parameter(bias, psize)
-
-
-# like MixedLayer, but no bias parameter
-@config_func
-def ExpressionLayer(name, inputs, **xargs):
-    MixedLayer(name, inputs, bias=False, **xargs)
-
-
-@config_layer('concat')
-class ConcatenateLayer(LayerBase):
-    layer_type = 'concat'
-
-    def __init__(self, name, inputs, bias=False, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        config_assert(not bias, 'ConcatenateLayer cannot support bias.')
-        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-        if self.layer_type == "mkldnn_concat":
-            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
-        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
-        super(ConcatenateLayer, self).__init__(
-            name, self.layer_type, 0, inputs=inputs, **xargs)
-        size = 0
-        for input_index in xrange(len(self.inputs)):
-            assert self.get_input_layer(0).height == self.get_input_layer(
-                input_index).height
-            assert self.get_input_layer(0).width == self.get_input_layer(
-                input_index).width
-            assert self.get_input_layer(0).depth == self.get_input_layer(
-                input_index).depth
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            if self.config.size == 0:
-                size += input_layer.size
-
-        self.set_layer_height_width(self.get_input_layer(0).height, \
-                                    self.get_input_layer(0).width)
-        self.set_layer_depth(self.get_input_layer(0).depth)
-        self.set_layer_size(size)
-
-
-@config_layer('mkldnn_concat')
-class MKLDNNConcatLayer(ConcatenateLayer):
-    layer_type = 'mkldnn_concat'
-
-
-# like concat layer, but each input layer was processed by a Projection.
-@config_layer('concat2')
-class ConcatenateLayer2(LayerBase):
-    def __init__(self, name, inputs, bias=False, **xargs):
-        config_assert(inputs, 'inputs cannot be empty')
-        super(ConcatenateLayer2, self).__init__(
-            name, 'concat2', 0, inputs=inputs, **xargs)
-
-        if isinstance(self.inputs[0], ConvProjection):
-            for input_index in xrange(len(self.inputs) - 1):
-                input = self.inputs[input_index + 1]
-                config_assert(
-                    isinstance(input, ConvProjection),
-                    "The first input of ConcatenateLayer2 is ConvProjection, "
-                    "the other inputs should also be ConvProjection.")
-
-        size = 0
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            output_size = input.calc_output_size(input_layer)
-            config_assert(output_size != 0, "proj output size is not set")
-            size += output_size
-
-        self.set_layer_size(size)
-
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            input = self.inputs[input_index]
-            input.proj_conf.input_size = input_layer.size
-            input.proj_conf.output_size = input.calc_output_size(input_layer)
-
-            input_config = self.config.inputs[input_index]
-            input_config.proj_conf.CopyFrom(input.proj_conf)
-            input_config.proj_conf.name = gen_parameter_name(name, input_index)
-            psize = input.calc_parameter_size(input.proj_conf.input_size,
-                                              input.proj_conf.output_size)
-            dims = input.calc_parameter_dims(input.proj_conf.input_size,
-                                             input.proj_conf.output_size)
-            self.create_input_parameter(input_index, psize, dims)
-
-        psize = self.config.size
-        if isinstance(self.inputs[0], ConvProjection):
-            self.config.shared_biases = True
-            psize = 0
-            for input in self.inputs:
-                psize += input.calc_bias_size()
-
-        if bias:
-            self.config.bias_size = psize
-            self.create_bias_parameter(bias, psize)
-
-
-@config_layer('recurrent')
-class RecurrentLayer(LayerBase):
-    layer_type = 'recurrent'
-
-    def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
-        use_mkl_packed = bool(
-            int(g_command_config_args.get("use_mkl_packed", 0)))
-        self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent'
-        super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs,
-                                             **xargs)
-        config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        size = input_layer.size
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        dims = [size, size]
-        self.create_input_parameter(0, size * size, dims)
-        self.create_bias_parameter(bias, self.config.size)
-
-
-@config_layer('lstmemory')
-class LstmLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 reversed=False,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(LstmLayer, self).__init__(name, 'lstmemory', 0, inputs, **xargs)
-        config_assert(len(self.inputs) == 1, 'LstmLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        #check input_layer.size is divided by 4
-        config_assert(input_layer.size % 4 == 0, "size % 4 should be 0!")
-        size = input_layer.size / 4
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        self.create_input_parameter(0, size * size * 4, [size, size, 4])
-        #bias includes 3 kinds of peephole, 4 + 3 = 7
-        self.create_bias_parameter(bias, size * 7)
-
-
-@config_layer('lstm_step')
-class LstmStepLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(LstmStepLayer, self).__init__(name, 'lstm_step', size, inputs,
-                                            **xargs)
-        config_assert(len(inputs) == 2, 'LstmStepLayer must have 2 inputs')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        config_assert(input_layer0.size == 4 * size,
-                      'input_layer0.size != 4 * layer.size')
-        config_assert(input_layer1.size == size,
-                      'input_layer1.size != layer.size')
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        self.create_bias_parameter(bias, size * 3)
-
-
-# get the specific output from the input layer.
-@config_layer('get_output')
-class GetOutputLayer(LayerBase):
-    def __init__(self, name, size, inputs):
-        super(GetOutputLayer, self).__init__(name, 'get_output', size, inputs)
-        config_assert(
-            len(self.inputs) == 1, 'GetOutputLayer must have 1 inputs')
-        inputs = self.inputs[0]
-        config_assert(inputs.input_layer_argument,
-                      'input_layer_argument cannot be empty')
-
-
-@config_layer('mdlstmemory')
-class MDLstmLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 directions=True,
-                 active_gate_type="sigmoid",
-                 active_state_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(MDLstmLayer, self).__init__(name, 'mdlstmemory', 0, inputs,
-                                          **xargs)
-        config_assert(len(self.inputs) == 1, 'MDLstmLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        dim_num = len(directions)
-        #check input_layer.size is divided by (3+dim_num)
-        config_assert(input_layer.size % (3 + dim_num) == 0,
-                      "size % (dim_num) should be 0!")
-        size = input_layer.size / (3 + dim_num)
-        self.set_layer_size(size)
-        self.config.active_gate_type = active_gate_type
-        self.config.active_state_type = active_state_type
-        for i in xrange(len(directions)):
-            self.config.directions.append(int(directions[i]))
-        self.create_input_parameter(0, size * size * (3 + dim_num),
-                                    [size, size, 3 + dim_num])
-        #bias includes 3 kinds of peephole, 3+dim_num+2+dim_num
-        self.create_bias_parameter(bias, size * (5 + 2 * dim_num))
-
-
-@config_layer('gated_recurrent')
-class GatedRecurrentLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 reversed=False,
-                 active_gate_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(GatedRecurrentLayer, self).__init__(name, 'gated_recurrent', 0,
-                                                  inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'GatedRecurrentLayer must have 1 input')
-        input_layer = self.get_input_layer(0)
-        #check input_layer.size is divided by 3
-        config_assert(input_layer.size % 3 == 0, "size % 3 should be 0!")
-        size = input_layer.size / 3
-        self.set_layer_size(size)
-        self.config.reversed = reversed
-        self.config.active_gate_type = active_gate_type
-        self.create_input_parameter(0, size * size * 3, [size, size * 3])
-        self.create_bias_parameter(bias, size * 3)
-
-
-@config_layer('gru_step')
-class GruStepLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_gate_type="sigmoid",
-                 bias=True,
-                 **xargs):
-        super(GruStepLayer, self).__init__(name, 'gru_step', size, inputs,
-                                           **xargs)
-        config_assert(len(self.inputs) == 2, 'GruStepLayer must have 2 input')
-        input_layer0 = self.get_input_layer(0)
-        input_layer1 = self.get_input_layer(1)
-        config_assert(input_layer0.size == 3 * size,
-                      'input_layer0.size != 3 * layer.size')
-        config_assert(input_layer1.size == size,
-                      'input_layer1.size != layer.size')
-        self.config.active_gate_type = active_gate_type
-        self.create_input_parameter(0, size * size * 3, [size, size * 3])
-        self.create_bias_parameter(bias, size * 3)
-
-
-'''
- A layer for calculating the cost of sequential conditional random field model.
- Example: CRFLayer(name="crf_cost", size=label_num,
-                   inputs=["output", "label", "weight"])
-          where "weight" is optional, one weight for each sequence
- @param coeff: weight of the layer
-'''
-
-
-@config_layer('crf')
-class CRFLayer(LayerBase):
-    def __init__(self, name, size, inputs, coeff=1.0, device=None):
-        super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
-        config_assert(2 <= len(self.inputs) <= 3,
-                      'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
-        self.config.coeff = coeff
-
-
-'''
- A layer for calculating the decoding sequence of sequential conditional
- random field model.
- The decoding sequence is stored in output_.ids
- If a second input is provided, it is treated as the ground-truth label, and
- this layer will also calculate error, output_.value[i] is 1 for incorrect
- decoding or 0 for correct decoding
-'''
-
-
-@config_layer('crf_decoding')
-class CRFDecodingLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
-        super(CRFDecodingLayer, self).__init__(
-            name, 'crf_decoding', size, inputs, device=device)
-        config_assert(
-            len(self.inputs) <= 2,
-            'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
-
-
-@config_layer('ctc')
-class CTCLayer(LayerBase):
-    def __init__(self, name, size, inputs, norm_by_times=False, device=None):
-        super(CTCLayer, self).__init__(name, 'ctc', size, inputs, device=device)
-        self.config.norm_by_times = norm_by_times
-        config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
-
-
-@config_layer('kmax_seq_score')
-class KmaxSeqScoreLayer(LayerBase):
-    def __init__(self, name, inputs, beam_size, **xargs):
-        super(KmaxSeqScoreLayer, self).__init__(
-            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
-        self.config.beam_size = beam_size
-
-
-@config_layer('warp_ctc')
-class WarpCTCLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 blank=0,
-                 norm_by_times=False,
-                 device=None):
-        super(WarpCTCLayer, self).__init__(
-            name, 'warp_ctc', size=size, inputs=inputs, device=device)
-        self.config.blank = blank
-        self.config.norm_by_times = norm_by_times
-        config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs')
-        input_layer = self.get_input_layer(0)
-        config_assert(
-            (input_layer.active_type == '' or
-             input_layer.active_type == 'linear'),
-            "Expecting the active_type of input layer to be linear or null")
-
-
-@config_layer('recurrent_layer_group')
-class RecurrentLayerGroup(LayerBase):
-    def __init__(self, name, device=None):
-        super(RecurrentLayerGroup, self).__init__(
-            name, 'recurrent_layer_group', 0, inputs=[], device=device)
-
-
-@config_layer('switch_order')
-class SwitchOrderLayer(LayerBase):
-    def __init__(self, name, inputs, reshape, **xargs):
-        super(SwitchOrderLayer, self).__init__(
-            name, 'switch_order', 0, inputs=inputs, **xargs)
-        self.config.reshape_conf.height_axis.extend(reshape['height'])
-        self.config.reshape_conf.width_axis.extend(reshape['width'])
-        input_layer = self.get_input_layer(0)
-        if reshape is None:
-            self.set_layer_size(input_layer.size)
-        else:
-            in_h = input_layer.height
-            in_w = input_layer.width
-            out_dims = None
-            if input_layer.has_depth():
-                in_d = input_layer.depth
-                in_c = input_layer.size / in_h / in_w / in_d
-                # batch_size, depth, height, width, channel
-                out_dims = [0, in_d, in_h, in_w, in_c]
-            else:
-                in_c = input_layer.size / in_h / in_w
-                # batch_size, height, width, channel
-                out_dims = [0, in_h, in_w, in_c]
-            # Because (reshape['width'][0] > 0) always be true.
-            # So out_dims[0] won't be used.
-            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
-            self.set_layer_size(size)
-
-
-@config_layer('scale_sub_region')
-class ScaleSubRegionLayer(LayerBase):
-    def __init__(self, name, inputs, value, **xargs):
-        super(ScaleSubRegionLayer, self).__init__(
-            name, 'scale_sub_region', 0, inputs=inputs, **xargs)
-        scale_sub_region_conf = self.config.inputs[0].scale_sub_region_conf
-        scale_sub_region_conf.value = value
-
-        # get channel, width and height from input_0 layer
-        input_layer = self.get_input_layer(0)
-        image_conf = scale_sub_region_conf.image_conf
-        image_conf.img_size = input_layer.width
-        image_conf.img_size_y = input_layer.height
-        image_conf.channels = input_layer.size / (input_layer.width *
-                                                  input_layer.height)
-        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
-                           image_conf.channels)
-
-
-@config_layer('factorization_machine')
-class FactorizationMachineLayer(LayerBase):
-    def __init__(self, name, inputs, factor_size, **xargs):
-        super(FactorizationMachineLayer, self).__init__(
-            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
-        config_assert(
-            len(self.inputs) == 1,
-            'factorization machine layer must have one and only one input.')
-        self.config.factor_size = factor_size
-        input_layer = self.get_input_layer(0)
-        psize = input_layer.size * factor_size
-        dims = [input_layer.size, factor_size]
-        self.create_input_parameter(0, psize, dims)
-
-
-# Deprecated, use a new layer specific class instead
-@config_func
-def Layer(name, type, **xargs):
-    layers = {}
-    layers.update(g_cost_map)
-    layers.update(g_layer_type_map)
-    layer_func = layers.get(type)
-    config_assert(layer_func, "layer type '%s' not supported." % type)
-    return layer_func(name, **xargs)
-
-
-@config_func
-def ParameterHook(type, **kwargs):
-    if type == 'pruning':
-        hook = ParameterUpdaterHookConfig()
-        hook.type = type
-        sparsity_ratio = kwargs.get('sparsity_ratio', None)
-        if sparsity_ratio is not None:
-            hook.sparsity_ratio = sparsity_ratio
-        return hook
-    elif type == 'dpruning':
-        hook = ParameterUpdaterHookConfig()
-        hook.type = type
-        return hook
-    else:
-        return None
-
-
-@config_func
-def Parameter(name,
-              size,
-              device,
-              dims,
-              learning_rate=None,
-              momentum=None,
-              decay_rate=None,
-              decay_rate_l1=None,
-              initial_mean=None,
-              initial_std=None,
-              initial_strategy=None,
-              initial_smart=None,
-              num_batches_regularization=None,
-              sparse_remote_update=None,
-              sparse_update=None,
-              gradient_clipping_threshold=None,
-              sparse=None,
-              format=None,
-              need_compact=None,
-              is_static=None,
-              is_shared=None,
-              update_hooks=None,
-              initializer=None):
-
-    config_assert(name not in g_parameter_map,
-                  'Duplicated parameter name: ' + name)
-
-    para = g_config.model_config.parameters.add()
-    para.name = name
-    para.size = size
-    if device is not None:
-        para.device = int(device)
-    para.dims.extend(dims)
-
-    if learning_rate is not None:
-        para.learning_rate = float(learning_rate)
-
-    momentum = default(momentum, g_default_momentum)
-    if momentum is not None:
-        para.momentum = float(momentum)
-
-    config_assert(not momentum or not decay_rate_l1,
-                  "momentum and decay_rate_l1 cannot both be non-zero")
-
-    decay_rate = default(decay_rate, g_default_decay_rate)
-    if decay_rate is not None:
-        para.decay_rate = decay_rate
-
-    if decay_rate_l1 is not None:
-        para.decay_rate_l1 = decay_rate_l1
-    para.initial_std = default(initial_std, g_default_initial_std)
-    para.initial_mean = default(initial_mean, g_default_initial_mean)
-
-    num_batches_regularization = default(num_batches_regularization,
-                                         g_default_num_batches_regularization)
-    if num_batches_regularization is not None:
-        para.num_batches_regularization = int(num_batches_regularization)
-
-    if sparse_remote_update is not None:
-        para.sparse_remote_update = sparse_remote_update
-        if sparse_remote_update:
-            g_config.opt_config.use_sparse_remote_updater = True
-    if sparse_update is not None:
-        para.sparse_update = sparse_update
-    gradient_clipping_threshold = default(gradient_clipping_threshold,
-                                          g_default_gradient_clipping_threshold)
-    if gradient_clipping_threshold is not None:
-        para.gradient_clipping_threshold = gradient_clipping_threshold
-    para.initial_strategy = default(initial_strategy,
-                                    g_default_initial_strategy)
-    para.initial_smart = default(initial_smart, g_default_initial_smart)
-    if para.initial_smart:
-        para.initial_mean = 0.
-        if len(para.dims) != 0:
-            para.initial_std = 1. / math.sqrt(para.dims[0])
-        else:
-            print(
-                "Use initial_smart, but dims not set. Initial_smart may not be used in this layer"
-            )
-            traceback.print_exc()
-            para.initial_std = 1. / math.sqrt(para.size)
-    if g_default_compact_func is not None:
-        sparse, format, need_compact = g_default_compact_func(para.name)
-
-    if sparse is not None:
-        para.is_sparse = sparse
-    if format is not None:
-        para.format = format
-    if need_compact is not None:
-        para.need_compact = need_compact
-    if is_static is not None:
-        para.is_static = is_static
-    config_assert(not para.sparse_remote_update or not para.is_static,
-                  "sparse_remote_update and is_static cannot both be true")
-    if is_shared is not None:
-        para.is_shared = is_shared
-
-    update_hooks = default(update_hooks, g_default_update_hooks)
-
-    if update_hooks is not None:
-        if hasattr(update_hooks, '__call__'):
-            update_hooks = update_hooks()
-
-        if isinstance(update_hooks, list):
-            for hook in update_hooks:
-                para.update_hooks.extend([hook])
-        else:
-            para.update_hooks.extend([update_hooks])
-
-    g_parameter_map[name] = para
-    if initializer is not None:
-        config_assert(
-            callable(initializer),
-            "parameter initializer should be a callable object")
-        g_parameter_initializer_map[name] = initializer
-
-
-@config_func
-def default_initial_std(val):
-    global g_default_initial_std
-    g_default_initial_std = val
-
-
-@config_func
-def default_initial_mean(val):
-    global g_default_initial_mean
-    g_default_initial_mean = val
-
-
-@config_func
-def default_initial_strategy(val):
-    global g_default_initial_strategy
-    g_default_initial_strategy = val
-
-
-@config_func
-def default_initial_smart(val):
-    global g_default_initial_smart
-    g_default_initial_smart = val
-
-
-@config_func
-def default_momentum(val):
-    global g_default_momentum
-    g_default_momentum = val
-
-
-@config_func
-def default_decay_rate(val):
-    global g_default_decay_rate
-    g_default_decay_rate = val
-
-
-@config_func
-def default_num_batches_regularization(val):
-    global g_default_num_batches_regularization
-    g_default_num_batches_regularization = val
-
-
-@config_func
-def default_gradient_clipping_threshold(val):
-    global g_default_gradient_clipping_threshold
-    g_default_gradient_clipping_threshold = val
-
-
-@config_func
-def default_device(val):
-    global g_default_device
-    g_default_device = val
-
-
-@config_func
-def default_update_hooks(val):
-    global g_default_update_hooks
-    g_default_update_hooks = val
-
-
-@config_func
-def default_compact_func(val):
-    global g_default_compact_func
-    g_default_compact_func = val
-
-
-def make_importer(config_dir, config_args):
-    def Import(config_file, local_args={}):
-        if not config_file.startswith('/'):
-            config_file = config_dir + '/' + config_file
-            g_config.config_files.append(config_file)
-        execfile(config_file,
-                 make_config_environment(config_file, config_args), local_args)
-
-    return Import
-
-
-DEFAULT_SETTING = dict(
-    batch_size=None,
-    mini_batch_size=None,
-    algorithm='async_sgd',
-    async_lagged_grad_discard_ratio=1.5,
-    learning_method='momentum',
-    gradient_clipping_threshold=None,
-    num_batches_per_send_parameter=None,
-    num_batches_per_get_parameter=None,
-    center_parameter_update_method=None,
-    learning_rate=1.,
-    learning_rate_decay_a=0.,
-    learning_rate_decay_b=0.,
-    learning_rate_schedule='poly',
-    learning_rate_args='',
-    l1weight=0.1,
-    l2weight=0.,
-    l2weight_zero_iter=0,
-    c1=0.0001,
-    backoff=0.5,
-    owlqn_steps=10,
-    max_backoff=5,
-    average_window=0,
-    do_average_in_cpu=False,
-    max_average_window=None,
-    ada_epsilon=1e-6,
-    ada_rou=0.95,
-    delta_add_rate=1.0,
-    shrink_parameter_value=0,
-    adam_beta1=0.9,
-    adam_beta2=0.999,
-    adam_epsilon=1e-8, )
-
-settings = copy.deepcopy(DEFAULT_SETTING)
-
-settings_deprecated = dict(usage_ratio=1., )
-
-trainer_settings = dict(
-    save_dir="./output/model",
-    init_model_path=None,
-    start_pass=0, )
-
-
-@config_func
-def Settings(**args):
-    for k, v in args.iteritems():
-        if k == "usage_ratio":
-            logger.warning(
-                "Deprecated: define usage_ratio in DataConfig instead")
-            if g_config.HasField("data_config"):
-                g_config.data_config.__setattr__(k, v)
-            settings_deprecated[k] = v
-            continue
-        elif k in settings:
-            settings[k] = v
-        elif k in trainer_settings:
-            trainer_settings[k] = v
-        else:
-            logger.fatal('Unkown setting: %s' % k)
-
-
-@config_func
-def cluster_config(**args):
-    pass
-
-
-@config_func
-def EnableSubmodelSuffix(flag=True):
-    """
-    If enabled, the layer and evaluator names in submodel will be automatically
-    appended with @submodel_name
-    """
-    global g_add_submodel_suffix
-    g_add_submodel_suffix = flag
-
-
-def make_config_environment(config_file, config_args):
-    def make_setter(k):
-        def setter(v):
-            logger.fatal("Obsolete: use Settings(%s=%s, ...) instead" % (k, v))
-
-        return setter
-
-    funcs = {}
-    funcs.update(g_config_funcs)
-
-    for k in settings.iterkeys():
-        funcs[k] = make_setter(k)
-    for k in settings_deprecated.iterkeys():
-        funcs[k] = make_setter(k)
-    config_dir = os.path.dirname(config_file)
-    if not config_dir:
-        config_dir = '.'
-
-    funcs.update(
-        Import=make_importer(config_dir, config_args),
-        get_config_arg=make_get_config_arg(config_args), )
-
-    funcs.update(g_extended_config_funcs)
-
-    return funcs
-
-
-def make_get_config_arg(config_args):
-    def get_config_arg(name, type, default=None):
-        if type == bool:
-            s = config_args.get(name)
-            if not s:
-                return default
-            if s == 'True' or s == '1' or s == 'true':
-                return True
-            if s == 'False' or s == '0' or s == 'false':
-                return False
-            raise ValueError('Value of config_arg %s is not boolean' % name)
-        else:
-            return type(config_args.get(name, default))
-
-    return get_config_arg
-
-
-def importlib(name):
-    __import__(name)
-    return sys.modules[name]
-
-
-def find_caller():
-    stack = traceback.extract_stack()
-    for s in stack[-4::-1]:
-        if not s[0].endswith('config_parser.py'):
-            return s[0], s[1], s[2]
-    return "(unknown file)", 0, "(unknown function)"
-
-
-def my_fatal(s):
-    logger.critical(s)
-    raise Exception()
-
-
-_parse_config_hooks = set()
-
-
-def register_parse_config_hook(f):
-    """
-    Register a hook function for parse_config. parse_config will invoke the hook
-    at the beginning of parse. This make it possible to reset global state for
-    for constructing the model.
-    """
-    _parse_config_hooks.add(f)
-
-
-def update_g_config():
-    '''
-    Update g_config after execute config_file or config_functions.
-    '''
-    for k, v in settings.iteritems():
-        if v is None:
-            continue
-        g_config.opt_config.__setattr__(k, v)
-
-    for k, v in trainer_settings.iteritems():
-        if v is None:
-            continue
-        g_config.__setattr__(k, v)
-
-    for name in g_config.model_config.input_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-        assert (g_layer_map[name].type == "data" or g_layer_map[name].type == "data_trim"), \
-            'The type of input layer "%s" is not "data"' % name
-    for name in g_config.model_config.output_layer_names:
-        assert name in g_layer_map, \
-            'input name "%s" does not correspond to a layer name' % name
-    return g_config
-
-
-def begin_parse():
-    init_config_environment()
-    for hook in _parse_config_hooks:
-        hook()
-
-    logger.findCaller = find_caller
-    logger.fatal = my_fatal
-
-    g_config.model_config.type = "nn"
-
-    global g_current_submodel, g_root_submodel
-    g_root_submodel = g_config.model_config.sub_models.add()
-    g_root_submodel.name = 'root'
-    g_root_submodel.is_recurrent_layer_group = False
-    g_current_submodel = g_root_submodel
-
-
-def parse_config(trainer_config, config_arg_str):
-    '''
-    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
-    passed to config script as a dictionary CONFIG_ARGS
-    '''
-
-    begin_parse()
-    config_args = {}
-
-    if config_arg_str:
-        config_args = dict([f.split('=') for f in config_arg_str.split(',')])
-
-    global g_command_config_args
-    g_command_config_args.update(config_args)
-
-    extension_module_name = config_args.get('extension_module_name')
-    if extension_module_name:
-        global g_extended_config_funcs
-        extension_module = importlib(extension_module_name)
-        g_extended_config_funcs = extension_module.get_config_funcs(g_config)
-
-    if hasattr(trainer_config, '__call__'):
-        trainer_config.func_globals.update(
-            make_config_environment("", config_args))
-        trainer_config()
-    else:
-        execfile(trainer_config,
-                 make_config_environment(trainer_config, config_args))
-
-    return update_g_config()
-
-
-def parse_config_and_serialize(trainer_config, config_arg_str):
-    try:
-        config = parse_config(trainer_config, config_arg_str)
-        #logger.info(config)
-        return config.SerializeToString()
-    except:
-        traceback.print_exc()
-        raise
-
-
-if __name__ == '__main__':
-    try:
-        config = parse_config(sys.argv[1], '')
-        config.SerializeToString()
-        __real_print__(str(config))
-    except:
-        traceback.print_exc()
-        raise
diff --git a/python/paddle/trainer/config_parser_extension.py b/python/paddle/trainer/config_parser_extension.py
deleted file mode 100644
index b9e0f3eb1..000000000
--- a/python/paddle/trainer/config_parser_extension.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.proto.DataConfig_pb2 import DataConfig
-
-g_config = None
-
-
-def SimpleData(files=None,
-               feat_dim=None,
-               context_len=None,
-               buffer_capacity=None):
-
-    data_config = DataConfig()
-    data_config.type = 'simple'
-    data_config.files = files
-    data_config.feat_dim = feat_dim
-    if context_len is not None:
-        data_config.context_len = context_len
-    if buffer_capacity:
-        data_config.buffer_capacity = buffer_capacity
-    return data_config
-
-
-def get_config_funcs(trainer_config):
-    global g_config
-    g_config = trainer_config
-    return dict(SimpleData=SimpleData)
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
deleted file mode 100644
index ef92107a1..000000000
--- a/python/paddle/trainer/recurrent_units.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# recurrent_units.py
-# Version 2.0
-#
-# Some recurrent units can be used in recurrent layer group,
-#   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units
-#
-# The modules in this file are DEPRECATED.
-# If you would like to use lstm/gru
-# please use the functions defined in paddle.trainer_config_helpers.
-
-from paddle.trainer.config_parser import *
-
-
-# long short term memory, can be used in recurrent machine
-# *inputs* must be a list of Projections, for example:
-#   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of
-#   two LstmRecurrentUnit is same, they share same parameters
-# *out_memory* can be defined outside if it's used outside
-def LstmRecurrentUnit(name,
-                      size,
-                      active_type,
-                      state_active_type,
-                      gate_active_type,
-                      inputs,
-                      para_prefix=None,
-                      error_clipping_threshold=0,
-                      out_memory=None):
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    state_memory = Memory(name=name + "_" + "state", size=size)
-
-    Layer(
-        name=name + "_" + "input_recurrent",
-        type="mixed",
-        size=size * 4,  #(input_s, input_gate, forget_gate, output_gate)
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"),
-        inputs=inputs + [
-            FullMatrixProjection(
-                out_memory, parameter_name=para_prefix + "_input_recurrent.w"),
-        ], )
-    LstmStepLayer(
-        name=name,
-        size=size,
-        bias=Bias(parameter_name=para_prefix + "_check.b"),
-        inputs=[name + "_" + "input_recurrent", state_memory],
-        active_type=active_type,
-        active_gate_type=gate_active_type,
-        active_state_type=state_active_type, )
-    GetOutputLayer(
-        name=name + "_" + "state",
-        size=size,
-        inputs=Input(
-            name, input_layer_argument="state"), )
-
-
-def LstmRecurrentUnitNaive(name,
-                           size,
-                           active_type,
-                           state_active_type,
-                           gate_active_type,
-                           inputs,
-                           para_prefix=None,
-                           error_clipping_threshold=0,
-                           out_memory=None):
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    state_memory = Memory(name=name + "_" + "state", size=size)
-
-    Layer(
-        name=name + "_" + "input_recurrent",
-        type="mixed",
-        size=size * 4,  #(input_s, input_gate, forget_gate, output_gate)
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_input_recurrent.b"),
-        inputs=inputs + [
-            FullMatrixProjection(
-                out_memory, parameter_name=para_prefix + "_input_recurrent.w"),
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "input_s",
-        size=size,
-        active_type=active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=0)
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "input_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size), DotMulProjection(
-                    state_memory, parameter_name=para_prefix + "_input_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "forget_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size * 2),
-            DotMulProjection(
-                state_memory, parameter_name=para_prefix + "_forget_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "state",
-        inputs=[
-            DotMulOperator([name + "_" + "input_s", name + "_" + "input_gate"]),
-            DotMulOperator([state_memory, name + "_" + "forget_gate"]),
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "output_gate",
-        active_type=gate_active_type,
-        inputs=[
-            IdentityOffsetProjection(
-                name + "_" + "input_recurrent", offset=size * 3),
-            DotMulProjection(
-                name + "_" + "state",
-                parameter_name=para_prefix + "_output_check.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "state_atv",
-        active_type=state_active_type,
-        inputs=IdentityProjection(name + "_" + "state"), )
-    ExpressionLayer(
-        name=name,
-        inputs=DotMulOperator(
-            [name + "_" + "state_atv", name + "_" + "output_gate"]), )
-
-
-# like LstmRecurrentUnit, but it's a layer group.
-# it is equivalent to LstmLayer
-def LstmRecurrentLayerGroup(name,
-                            size,
-                            active_type,
-                            state_active_type,
-                            gate_active_type,
-                            inputs,
-                            para_prefix=None,
-                            error_clipping_threshold=0,
-                            seq_reversed=False):
-
-    input_layer_name = name + "_" + "transform_input"
-    Layer(
-        name=input_layer_name,
-        type="mixed",
-        size=size * 4,
-        active_type="",
-        bias=False,
-        inputs=inputs, )
-
-    RecurrentLayerGroupBegin(
-        name + "_layer_group",
-        in_links=[input_layer_name],
-        out_links=[name],
-        seq_reversed=seq_reversed)
-
-    LstmRecurrentUnit(
-        name=name,
-        size=size,
-        active_type=active_type,
-        state_active_type=state_active_type,
-        gate_active_type=gate_active_type,
-        inputs=[IdentityProjection(input_layer_name)],
-        para_prefix=para_prefix,
-        error_clipping_threshold=error_clipping_threshold, )
-
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-# gated recurrent unit, can be used in recurrent machine
-# *inputs* should be a list of Projections, for example:
-#   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of
-#   two GatedRecurrentUnit is same, they share same parameters
-# *out_memory* can be defined outside if it's used outside
-
-
-def GatedRecurrentUnit(name,
-                       size,
-                       active_type,
-                       gate_active_type,
-                       inputs,
-                       para_prefix=None,
-                       error_clipping_threshold=0,
-                       out_memory=None):
-    if type_of(inputs) == str:  #only used by GatedRecurrentLayerGroup
-        input_layer_name = inputs
-    else:
-        input_layer_name = name + "_" + "transform_input"
-        Layer(
-            name=input_layer_name,
-            type="mixed",
-            size=size * 3,
-            active_type="",
-            bias=False,
-            inputs=inputs, )
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    GruStepLayer(
-        name=name,
-        size=size,
-        bias=Bias(parameter_name=para_prefix + "_gate.b"),
-        inputs=[
-            input_layer_name, Input(
-                out_memory, parameter_name=para_prefix + "_gate.w")
-        ],
-        active_type=active_type,
-        active_gate_type=gate_active_type, )
-
-
-def GatedRecurrentUnitNaive(name,
-                            size,
-                            active_type,
-                            gate_active_type,
-                            inputs,
-                            para_prefix=None,
-                            error_clipping_threshold=0,
-                            out_memory=None):
-
-    if type_of(inputs) == str:  #only used by GatedRecurrentLayerGroup
-        input_layer_name = inputs
-    else:
-        input_layer_name = name + "_" + "transform_input"
-        Layer(
-            name=input_layer_name,
-            type="mixed",
-            size=size * 3,
-            active_type="",
-            bias=False,
-            inputs=inputs, )
-
-    if para_prefix is None:
-        para_prefix = name
-    if out_memory is None:
-        out_memory = Memory(name=name, size=size)
-
-    Layer(
-        name=name + "_" + "update_gate",
-        type="mixed",
-        size=size,
-        active_type=gate_active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_update_gate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=0), FullMatrixProjection(
-                    out_memory, parameter_name=para_prefix + "_update_gate.w")
-        ], )
-    Layer(
-        name=name + "_" + "reset_gate",
-        type="mixed",
-        size=size,
-        active_type=gate_active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_reset_gate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=size), FullMatrixProjection(
-                    out_memory, parameter_name=para_prefix + "_reset_gate.w")
-        ], )
-    ExpressionLayer(
-        name=name + "_" + "reset_output",
-        inputs=DotMulOperator([out_memory, name + "_" + "reset_gate"]), )
-    Layer(
-        name=name + "_" + "output_candidate",
-        type="mixed",
-        size=size,
-        active_type=active_type,
-        error_clipping_threshold=error_clipping_threshold,
-        bias=Bias(
-            initial_std=0, parameter_name=para_prefix + "_output_candidate.b"),
-        inputs=[
-            IdentityOffsetProjection(
-                input_layer_name, offset=size * 2), FullMatrixProjection(
-                    name + "_" + "reset_output",
-                    parameter_name=para_prefix + "_output_candidate.w")
-        ], )
-    ExpressionLayer(  #element-wise interpolation
-        name=name,
-        inputs=[
-            IdentityProjection(out_memory),
-            DotMulOperator(
-                [out_memory, name + "_" + "update_gate"], scale=-1.0),
-            DotMulOperator(
-                [name + "_" + "output_candidate", name + "_" + "update_gate"]),
-        ], )
-
-
-# like GatedRecurrentUnit, but it's a layer group.
-# it is equivalent to GatedRecurrentLayer.
-def GatedRecurrentLayerGroup(name,
-                             size,
-                             active_type,
-                             gate_active_type,
-                             inputs,
-                             para_prefix=None,
-                             error_clipping_threshold=0,
-                             seq_reversed=False):
-
-    input_layer_name = name + "_" + "transform_input"
-    Layer(
-        name=input_layer_name,
-        type="mixed",
-        size=size * 3,
-        active_type="",
-        bias=False,
-        inputs=inputs, )
-
-    RecurrentLayerGroupBegin(
-        name + "_layer_group",
-        in_links=[input_layer_name],
-        out_links=[name],
-        seq_reversed=seq_reversed)
-
-    GatedRecurrentUnit(
-        name=name,
-        size=size,
-        active_type=active_type,
-        gate_active_type=gate_active_type,
-        inputs=input_layer_name,  #transform outside
-        para_prefix=para_prefix,
-        error_clipping_threshold=error_clipping_threshold, )
-
-    RecurrentLayerGroupEnd(name + "_layer_group")
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
deleted file mode 100644
index 13155ebdd..000000000
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from activations import *
-from data_sources import *
-from poolings import *
-from evaluators import *
-from layers import *
-from networks import *
-from optimizers import *
-from attrs import *
-from config_parser_utils import *
-# This will enable operator overload for LayerOutput
-import layer_math
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
deleted file mode 100644
index 368396826..000000000
--- a/python/paddle/trainer_config_helpers/activations.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    "TanhActivation", "SigmoidActivation", "SoftmaxActivation",
-    "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
-    'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
-    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation",
-    "SoftSignActivation"
-]
-
-
-class BaseActivation(object):
-    """
-    A mark for activation class.
-    Each activation inherit BaseActivation, which has two parameters.
-
-    :param name: activation name in paddle config.
-    :type name: basestring
-    :param support_hppl: True if supported by hppl. HPPL is a library used by paddle
-                         internally. Currently, lstm layer can only use activations
-                         supported by hppl.
-    :type support_hppl: bool
-    """
-
-    def __init__(self, name, support_hppl):
-        self.name = name
-        self.support_hppl = support_hppl
-
-    def __repr__(self):
-        return self.name
-
-
-class TanhActivation(BaseActivation):
-    """
-    Tanh activation.
-
-    .. math::
-
-       f(z)=tanh(z)=\\frac{e^z-e^{-z}}{e^z+e^{-z}}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'tanh', True)
-
-
-class SigmoidActivation(BaseActivation):
-    """
-    Sigmoid activation.
-
-    .. math::
-
-       f(z) = \\frac{1}{1+exp(-z)}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sigmoid', True)
-
-
-class SoftmaxActivation(BaseActivation):
-    """
-    Softmax activation for simple input
-
-
-
-    .. math::
-
-       P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softmax', False)
-
-
-class SequenceSoftmaxActivation(BaseActivation):
-    """
-    Softmax activation for one sequence. The dimension of input feature must be
-    1 and a sequence.
-
-    ..  code:: python
-
-        result = softmax(for each_feature_vector[0] in input_feature)
-        for i, each_time_step_output in enumerate(output):
-            each_time_step_output = result[i]
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sequence_softmax', False)
-
-
-class IdentityActivation(BaseActivation):
-    """
-    Identity Activation.
-
-    Just do nothing for output both forward/backward.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, '', False)
-
-
-LinearActivation = IdentityActivation
-
-
-class ReluActivation(BaseActivation):
-    """
-    Relu activation.
-
-    forward. :math:`y = max(0, z)`
-
-    derivative:
-
-    .. math::
-
-       1  &\\quad if z > 0 \\\\
-       0  &\\quad\\mathrm{otherwize}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'relu', True)
-
-
-class BReluActivation(BaseActivation):
-    """
-    BRelu Activation.
-
-    forward.  :math:`y = min(24, max(0, z))`
-
-    derivative:
-
-    .. math::
-
-       1  &\\quad if 0 < z < 24 \\\\
-       0  &\\quad \\mathrm{otherwise}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'brelu', False)
-
-
-class SoftReluActivation(BaseActivation):
-    """
-    SoftRelu Activation.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softrelu', False)
-
-
-class STanhActivation(BaseActivation):
-    """
-    Scaled Tanh Activation.
-
-    .. math::
-
-       f(z) = 1.7159 * tanh(2/3*z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'stanh', False)
-
-
-class AbsActivation(BaseActivation):
-    """
-    Abs Activation.
-
-    Forward:    :math:`f(z) = abs(z)`
-
-    Derivative:
-
-    .. math::
-
-       1 &\\quad if \\quad z > 0 \\\\
-       -1 &\\quad if \\quad z < 0 \\\\
-       0 &\\quad if \\quad z = 0
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'abs', False)
-
-
-class SquareActivation(BaseActivation):
-    """
-    Square Activation.
-
-    .. math::
-       f(z) = z^2.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'square', False)
-
-
-class ExpActivation(BaseActivation):
-    """
-    Exponential Activation.
-
-    .. math::
-       f(z) = e^z.
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'exponential', False)
-
-
-class LogActivation(BaseActivation):
-    """
-    Logarithm Activation.
-
-    .. math::
-       f(z) = log(z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'log', False)
-
-
-class SqrtActivation(BaseActivation):
-    """
-    Square Root Activation.
-
-    .. math::
-       f(z) = sqrt(z)
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'sqrt', False)
-
-
-class ReciprocalActivation(BaseActivation):
-    """
-    Reciprocal Activation.
-
-    .. math::
-       f(z)=\\frac{1}{z}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'reciprocal', False)
-
-
-class SoftSignActivation(BaseActivation):
-    """
-    SoftSign Activation.
-
-    .. math::
-       f(z)=\\frac{z}{1 + |z|}
-    """
-
-    def __init__(self):
-        BaseActivation.__init__(self, 'softsign', False)
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
deleted file mode 100644
index 4e3beaf63..000000000
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import *
-__all__ = [
-    'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute',
-    'ExtraLayerAttribute'
-]
-
-
-def convert_and_compare(x, Type):
-    """
-    Convert x to be the same type as Type and then convert back to
-    check whether there is a loss of information
-    :param x: object to be checked
-    :param Type: target type to check x over
-
-    """
-    return type(x)(Type(x)) == x
-
-
-def is_compatible_with(x, Type):
-    """
-    Check if x has a type compatible with Type
-    :param x: object to be checked
-    :param Type: target type to check x over
-
-    """
-    if type(x) == Type:
-        return True
-    try:
-        if float == Type or int == Type:
-            # avoid those types that can be converted to float/int but not very
-            # meaningful and  could potentially lead to error
-            # i.e., str and bool typed value should not be used for initializing float/int variable
-            if not isinstance(x, str) and not isinstance(x, bool):
-                return convert_and_compare(x, Type)
-        elif bool == Type:
-            # should not use string type to initialize bool variable
-            if not isinstance(x, str):
-                return convert_and_compare(x, Type)
-        else:
-            return False
-    except:
-        return False
-
-
-class HookAttribute(object):
-    """
-    Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs
-    during training process of a layer with parameters, such as img_conv layer, fc layer.
-
-    :param  type: Hook type, currently supported types:
-                        'pruning' :  user specify a sparsity_ratio before training started, and the
-                            network will prune the parameters based on the sparsity_ratio.
-                            eg: The definition of Hook object can be hk = HookAttribute('pruning', 0.6)
-                            The specific usage can be paddle.layer.img_conv(input=img, filter_size=3,
-                                                                       num_channels=3, num_filters=64,
-                                                                       param_attr=ParameterAttribute(update_hooks=hk) )
-                            The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf
-    :type type: string
-
-    :param sparsity_ratio: Must be specified if hook type is 'pruning',
-                        it represents the ratio of the zero elements to be set by the Parameter.
-    :type sparsity_ratio: float or None
-
-    """
-
-    def __init__(self, type, sparsity_ratio=None):
-        self.type = type
-        self.sparsity_ratio = sparsity_ratio
-        if self.sparsity_ratio is not None:
-            assert is_compatible_with(
-                self.sparsity_ratio,
-                float), 'sparisity_ratio must be float type'
-            assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparsity_ratio must be a float between [0, 1] '
-
-    def __call__(self):
-        return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio)
-
-
-class ParameterAttribute(object):
-    """
-    Parameter Attributes object. To fine-tuning network training process, user
-    can set attribute to control training details, such as l1,l2 rate / learning
-    rate / how to init param.
-
-    NOTE: IT IS A HIGH LEVEL USER INTERFACE.
-
-    :param is_static: True if this parameter will be fixed while training.
-    :type is_static: bool
-
-    :param initial_std: Gauss Random initialization standard deviation.
-                        None if not using Gauss Random initialize parameter.
-    :type initial_std: float or None
-    :param initial_mean:  Gauss Random initialization mean.
-                         None if not using Gauss Random initialize parameter.
-    :type initial_mean: float or None
-    :param initial_max: Uniform initialization max value.
-    :type initial_max: float or None
-    :param initial_min: Uniform initialization min value.
-    :type initial_min: float or None
-    :param l1_rate: the l1 regularization factor
-    :type l1_rate: float or None
-    :param l2_rate: the l2 regularization factor
-    :type l2_rate: float or None
-    :param learning_rate: The parameter learning rate. None means 1.
-                          The learning rate when optimize is LEARNING_RATE =
-                          GLOBAL_LEARNING_RATE * PARAMETER_LEARNING_RATE
-                          * SCHEDULER_FACTOR.
-
-    :type learning_rate: float or None
-    :param momentum: The parameter momentum. None means use global value.
-    :type momentum: float or None
-    :param gradient_clipping_threshold: gradient clipping threshold. If gradient
-                                        value larger than some value, will be
-                                        clipped.
-    :type gradient_clipping_threshold: float
-    :param sparse_update: Enable sparse update for this parameter. It will
-                          enable both local and remote sparse update.
-    :type sparse_update: bool
-    :param update_hooks: A HookAttribute object.
-    :type update_hooks: HookAttribute
-    :param initializer: If not None, it should be a callable object which accepts
-                        a parameter name and returns numpy array for the initial
-                        value of the parameter
-    :type initializer: callable object
-    """
-
-    def __init__(self,
-                 name=None,
-                 is_static=False,
-                 initial_std=None,
-                 initial_mean=None,
-                 initial_max=None,
-                 initial_min=None,
-                 l1_rate=None,
-                 l2_rate=None,
-                 learning_rate=None,
-                 momentum=None,
-                 gradient_clipping_threshold=None,
-                 sparse_update=False,
-                 update_hooks=None,
-                 initializer=None):
-        self.attr = {}
-
-        if is_static:
-            self.attr['is_static'] = True
-
-        if initial_std is None and initial_mean is None and initial_max \
-                is None and initial_min is None:
-            self.attr['initial_smart'] = True
-        elif is_compatible_with(initial_std, float) or \
-             is_compatible_with(initial_mean, float):
-            if initial_std is not None:
-                self.attr['initial_std'] = initial_std
-            if initial_mean is not None:
-                self.attr['initial_mean'] = initial_mean
-            self.attr['initial_strategy'] = 0  # Gauss Random
-        elif is_compatible_with(initial_max, float) and \
-             is_compatible_with(initial_min, float):
-            initial_max = initial_max
-            initial_min = initial_min
-            assert initial_min < initial_max
-            initial_mean = (initial_max + initial_min) / 2
-            initial_std = initial_mean - initial_min
-            self.attr['initial_mean'] = initial_mean
-            self.attr['initial_std'] = initial_std
-            self.attr['initial_strategy'] = 1  # Uniform Random
-        else:
-            raise RuntimeError("Unexpected branch.")
-
-        if not is_static and is_compatible_with(l1_rate, float):
-            self.attr['decay_rate_l1'] = l1_rate
-
-        if not is_static and is_compatible_with(l2_rate, float):
-            self.attr['decay_rate'] = l2_rate
-
-        if not is_static and is_compatible_with(learning_rate, float):
-            self.attr['learning_rate'] = learning_rate
-
-        if not is_static and is_compatible_with(momentum, float):
-            self.attr['momentum'] = momentum
-
-        if name is not None:
-            self.attr['parameter_name'] = name
-
-        if sparse_update:
-            self.attr['sparse_update'] = True
-            self.attr['sparse_remote_update'] = True
-
-        if gradient_clipping_threshold is not None and \
-                is_compatible_with(gradient_clipping_threshold, float):
-            self.attr['gradient_clipping_threshold'] = \
-                gradient_clipping_threshold
-        if initializer is not None:
-            self.attr['initializer'] = initializer
-
-        if update_hooks:
-            self.attr['update_hooks'] = update_hooks
-
-    def set_default_parameter_name(self, name):
-        """
-        Set default parameter name. If parameter not set, then will use default
-        parameter name.
-
-
-        :param name: default parameter name.
-        :type name: basestring
-        """
-        if 'parameter_name' not in self.attr:
-            self.attr['parameter_name'] = name
-
-    @staticmethod
-    def to_bias(bias_attr):
-        if isinstance(bias_attr, ParameterAttribute):
-            return Bias(**bias_attr.attr)
-        else:
-            return False
-
-
-class ExtraLayerAttribute(object):
-    """
-    Some high level layer attributes config. You can set all attributes here,
-    but some layer doesn't support all attributes. If you set an attribute to a
-    layer that not support this attribute, paddle will print an error and core.
-
-    :param error_clipping_threshold: Error clipping threshold.
-    :type error_clipping_threshold: float
-    :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
-                      The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `JMLRdropout
-                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
-                      >`_.
-    :type drop_rate: float
-    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `use_case
-                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
-                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
-                   -different-devices>`_.
-    :type device: int
-    """
-
-    def __init__(self,
-                 error_clipping_threshold=None,
-                 drop_rate=None,
-                 device=None):
-        self.attr = dict()
-        if error_clipping_threshold is not None:
-            error_clipping_threshold = float(error_clipping_threshold)
-            if error_clipping_threshold < 0:
-                raise ValueError("Error clipping must > 0")
-            self.attr['error_clipping_threshold'] = error_clipping_threshold
-        if drop_rate is not None:
-            drop_rate = float(drop_rate)
-            if drop_rate < 0:
-                raise ValueError("Dropout rate must > 0")
-            self.attr["drop_rate"] = drop_rate
-
-        if isinstance(device, int):
-            self.attr["device"] = device
-
-    def check(self, layer_name):
-        for key in self.attr:
-            if not hasattr(self, 'can_%s' % key) or \
-                    not getattr(self, 'can_%s' % key):
-                raise NotImplementedError("Layer %s does not support %s" %
-                                          (layer_name, key))
-
-    @staticmethod
-    def to_kwargs(attr):
-        if attr is None:
-            return dict()
-        else:
-            return attr.attr
-
-
-HookAttr = HookAttribute
-ParamAttr = ParameterAttribute
-ExtraAttr = ExtraLayerAttribute
diff --git a/python/paddle/trainer_config_helpers/config_parser_utils.py b/python/paddle/trainer_config_helpers/config_parser_utils.py
deleted file mode 100644
index ee5bbbfb2..000000000
--- a/python/paddle/trainer_config_helpers/config_parser_utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import paddle.trainer.config_parser as config_parser
-from paddle.proto.TrainerConfig_pb2 import OptimizationConfig
-'''
-This file is a wrapper of formal config_parser. The main idea of this file is to
-separete different config logic into different function, such as network configuration
- and optimizer configuration.
-'''
-
-__all__ = [
-    "parse_trainer_config", "parse_network_config", "parse_optimizer_config",
-    "reset_parser"
-]
-
-
-def parse_trainer_config(trainer_conf, config_arg_str):
-    return config_parser.parse_config(trainer_conf, config_arg_str)
-
-
-def parse_network_config(network_conf, config_arg_str=''):
-    config = config_parser.parse_config(network_conf, config_arg_str)
-    return config.model_config
-
-
-def parse_optimizer_config(optimizer_conf, config_arg_str=''):
-    config_parser.settings = copy.deepcopy(config_parser.DEFAULT_SETTING)
-    optimizer_conf()
-    opt_config = OptimizationConfig()
-    for k, v in config_parser.settings.iteritems():
-        if v is None:
-            continue
-        opt_config.__setattr__(k, v)
-    return opt_config
-
-
-def reset_parser():
-    config_parser.begin_parse()
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
deleted file mode 100644
index a2a32d848..000000000
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Data Sources are helpers to define paddle training data or testing data.
-"""
-from paddle.trainer.config_parser import *
-from .utils import deprecated
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-
-__all__ = ['define_py_data_sources2']
-
-
-def define_py_data_source(file_list,
-                          cls,
-                          module,
-                          obj,
-                          args=None,
-                          async=False,
-                          data_cls=PyData):
-    """
-    Define a python data source.
-
-    For example, the simplest usage in trainer_config.py as follow:
-
-    ..  code-block:: python
-
-        define_py_data_source("train.list", TrainData, "data_provider", "process")
-
-    Or. if you want to pass arguments from trainer_config to data_provider.py, then
-
-    ..  code-block:: python
-
-        define_py_data_source("train.list", TrainData, "data_provider", "process",
-                              args={"dictionary": dict_name})
-
-    :param data_cls:
-    :param file_list: file list name, which contains all data file paths
-    :type file_list: basestring
-    :param cls: Train or Test Class.
-    :type cls: TrainData or TestData
-    :param module: python module name.
-    :type module: basestring
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper.
-    :type obj: basestring
-    :param args: The best practice is using dict to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to
-                 receive arguments.
-    :type args: string or picklable object
-    :param async: Load Data asynchronously or not.
-    :type async: bool
-    :return: None
-    :rtype: None
-    """
-    if isinstance(file_list, list):
-        file_list_name = 'train.list'
-        if cls == TestData:
-            file_list_name = 'test.list'
-        with open(file_list_name, 'w') as f:
-            f.writelines(file_list)
-        file_list = file_list_name
-
-    if not isinstance(args, basestring) and args is not None:
-        args = pickle.dumps(args, 0)
-
-    cls(
-        data_cls(
-            files=file_list,
-            load_data_module=module,
-            load_data_object=obj,
-            load_data_args=args,
-            async_load_data=async))
-
-
-def define_py_data_sources(train_list,
-                           test_list,
-                           module,
-                           obj,
-                           args=None,
-                           train_async=False,
-                           data_cls=PyData):
-    """
-    The annotation is almost the same as define_py_data_sources2, except that
-    it can specific train_async and data_cls.
-
-    :param data_cls:
-    :param train_list: Train list name.
-    :type train_list: basestring
-    :param test_list: Test list name.
-    :type test_list: basestring
-    :param module: python module name. If train and test is different, then
-                   pass a tuple or list to this argument.
-    :type module: basestring or tuple or list
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper. If train and test is different, then pass
-                a tuple or list to this argument.
-    :type obj: basestring or tuple or list
-    :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive
-                 arguments. If train and test is different, then pass a tuple
-                 or list to this argument.
-    :type args: string or picklable object or list or tuple.
-    :param train_async: Is training data load asynchronously or not.
-    :type train_async: bool
-    :return: None
-    :rtype: None
-    """
-
-    def __is_splitable__(o):
-        return (isinstance(o, list) or
-                isinstance(o, tuple)) and hasattr(o, '__len__') and len(o) == 2
-
-    assert train_list is not None or test_list is not None
-    assert module is not None and obj is not None
-
-    test_module = module
-    train_module = module
-    if __is_splitable__(module):
-        train_module, test_module = module
-
-    test_obj = obj
-    train_obj = obj
-    if __is_splitable__(obj):
-        train_obj, test_obj = obj
-
-    if args is None:
-        args = ""
-
-    train_args = args
-    test_args = args
-    if __is_splitable__(args):
-        train_args, test_args = args
-
-    if train_list is not None:
-        define_py_data_source(train_list, TrainData, train_module, train_obj,
-                              train_args, train_async, data_cls)
-
-    if test_list is not None:
-        define_py_data_source(test_list, TestData, test_module, test_obj,
-                              test_args, False, data_cls)
-
-
-def define_py_data_sources2(train_list, test_list, module, obj, args=None):
-    """
-    Define python Train/Test data sources in one method. If train/test use
-    the same Data Provider configuration, module/obj/args contain one argument,
-    otherwise contain a list or tuple of arguments. For example\:
-
-    ..  code-block:: python
-
-        define_py_data_sources2(train_list="train.list",
-                                test_list="test.list",
-                                module="data_provider"
-                                # if train/test use different configurations,
-                                # obj=["process_train", "process_test"]
-                                obj="process",
-                                args={"dictionary": dict_name})
-
-    The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
-
-    :param train_list: Train list name.
-    :type train_list: basestring
-    :param test_list: Test list name.
-    :type test_list: basestring
-    :param module: python module name. If train and test is different, then
-                   pass a tuple or list to this argument.
-    :type module: basestring or tuple or list
-    :param obj: python object name. May be a function name if using
-                PyDataProviderWrapper. If train and test is different, then pass
-                a tuple or list to this argument.
-    :type obj: basestring or tuple or list
-    :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive
-                 arguments. If train and test is different, then pass a tuple
-                 or list to this argument.
-    :type args: string or picklable object or list or tuple.
-    :return: None
-    :rtype: None
-    """
-
-    def py_data2(files, load_data_module, load_data_object, load_data_args,
-                 **kwargs):
-        data = create_data_config_proto()
-        data.type = 'py2'
-        data.files = files
-        data.load_data_module = load_data_module
-        data.load_data_object = load_data_object
-        data.load_data_args = load_data_args
-        data.async_load_data = False
-        return data
-
-    define_py_data_sources(
-        train_list=train_list,
-        test_list=test_list,
-        module=module,
-        obj=obj,
-        args=args,
-        data_cls=py_data2)
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
deleted file mode 100644
index 69d860d9d..000000000
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import functools
-import inspect
-from .attrs import ParamAttr
-from .activations import TanhActivation
-from paddle.trainer.config_parser import *
-
-__all__ = [
-    'wrap_name_default', 'wrap_param_attr_default', 'wrap_bias_attr_default',
-    'wrap_act_default', 'wrap_param_default'
-]
-
-
-def __default_not_set_callback__(kwargs, name):
-    return name not in kwargs or kwargs[name] is None
-
-
-def wrap_param_default(param_names=None,
-                       default_factory=None,
-                       not_set_callback=__default_not_set_callback__):
-    assert param_names is not None
-    assert isinstance(param_names, list) or isinstance(param_names, tuple)
-    for each_param_name in param_names:
-        assert isinstance(each_param_name, basestring)
-
-    def __impl__(func):
-        @functools.wraps(func)
-        def __wrapper__(*args, **kwargs):
-            if len(args) != 0:
-                argspec = inspect.getargspec(func)
-                num_positional = len(argspec.args)
-                if argspec.defaults:
-                    num_positional -= len(argspec.defaults)
-                if not argspec.varargs and len(args) > num_positional:
-                    logger.fatal(
-                        "Must use keyword arguments for non-positional args")
-            for name in param_names:
-                if not_set_callback(kwargs, name):  # Not set
-                    kwargs[name] = default_factory(func)
-            return func(*args, **kwargs)
-
-        if hasattr(func, 'argspec'):
-            __wrapper__.argspec = func.argspec
-        else:
-            __wrapper__.argspec = inspect.getargspec(func)
-        return __wrapper__
-
-    return __impl__
-
-
-class DefaultNameFactory(object):
-    def __init__(self, name_prefix):
-        self.__counter__ = 0
-        self.__name_prefix__ = name_prefix
-
-    def __call__(self, func):
-        if self.__name_prefix__ is None:
-            self.__name_prefix__ = func.__name__
-        tmp = "__%s_%d__" % (self.__name_prefix__, self.__counter__)
-        self.__check_name__(tmp)
-        self.__counter__ += 1
-        return tmp
-
-    def __check_name__(self, nm):
-        """
-        @TODO(yuyang18): Implement it!
-        @param nm:
-        @return:
-        """
-        pass
-
-    def reset(self):
-        self.__counter__ = 0
-
-
-_name_factories = []
-
-
-def reset_hook():
-    for factory in _name_factories:
-        factory.reset()
-
-
-register_parse_config_hook(reset_hook)
-
-
-def wrap_name_default(name_prefix=None, name_param="name"):
-    """
-    Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}".
-
-    ..  code:: python
-
-        @wrap_name_default("some_name")
-        def func(name=None):
-            print name      # name will never be None. If name is not set,
-                            # name will be "some_name_%d"
-
-    :param name_prefix: name prefix. wrapped function's __name__ if None.
-    :type name_prefix: basestring
-    :return: a decorator to set default name
-    :rtype: callable
-    """
-    factory = DefaultNameFactory(name_prefix)
-    _name_factories.append(factory)
-    return wrap_param_default([name_param], factory)
-
-
-def wrap_param_attr_default(param_names=None, default_factory=None):
-    """
-    Setting Default Parameter Attributes Decorator.
-
-    :param default_factory:
-    :param param_names: Parameter Attribute's Names, list of string
-    :type param_names: list
-    :return: decorator
-    """
-    if param_names is None:
-        param_names = ['param_attr']
-    if default_factory is None:
-        default_factory = lambda _: ParamAttr()
-
-    return wrap_param_default(param_names, default_factory)
-
-
-def wrap_bias_attr_default(param_names=None,
-                           default_factory=None,
-                           has_bias=True):
-    if param_names is None:
-        param_names = ['bias_attr']
-    if default_factory is None:
-        default_factory = lambda _: ParamAttr(initial_std=0., initial_mean=0.)
-
-    def __bias_attr_not_set__(kwargs, name):
-        if has_bias:
-            return name not in kwargs or kwargs[name] is None or \
-                   kwargs[name] == True
-        else:
-            return name in kwargs and kwargs[name] == True
-
-    return wrap_param_default(param_names, default_factory,
-                              __bias_attr_not_set__)
-
-
-def wrap_act_default(param_names=None, act=None):
-    if param_names is None:
-        param_names = ["act"]
-
-    if act is None:
-        act = TanhActivation()
-
-    return wrap_param_default(param_names, lambda _: act)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
deleted file mode 100644
index 0eeaf7eab..000000000
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ /dev/null
@@ -1,813 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import *
-from default_decorators import *
-
-__all__ = [
-    "evaluator_base",
-    "classification_error_evaluator",
-    "auc_evaluator",
-    "pnpair_evaluator",
-    "precision_recall_evaluator",
-    "ctc_error_evaluator",
-    "chunk_evaluator",
-    "sum_evaluator",
-    "column_sum_evaluator",
-    "value_printer_evaluator",
-    "gradient_printer_evaluator",
-    "maxid_printer_evaluator",
-    "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator",
-    "classification_error_printer_evaluator",
-    "detection_map_evaluator",
-]
-
-
-class EvaluatorAttribute(object):
-    FOR_CLASSIFICATION = 1
-    FOR_REGRESSION = 1 << 1
-    FOR_RANK = 1 << 2
-    FOR_PRINT = 1 << 3
-    FOR_UTILS = 1 << 4
-    FOR_DETECTION = 1 << 5
-
-    KEYS = [
-        "for_classification", "for_regression", "for_rank", "for_print",
-        "for_utils", "for_detection"
-    ]
-
-    @staticmethod
-    def to_key(idx):
-        tmp = 1
-        for i in xrange(0, len(EvaluatorAttribute.KEYS)):
-            if idx == tmp:
-                return EvaluatorAttribute.KEYS[i]
-            else:
-                tmp = (tmp << 1)
-
-
-def evaluator(*attrs):
-    def impl(method):
-        for attr in attrs:
-            setattr(method, EvaluatorAttribute.to_key(attr), True)
-        method.is_evaluator = True
-        return method
-
-    return impl
-
-
-def evaluator_base(input,
-                   type,
-                   label=None,
-                   weight=None,
-                   name=None,
-                   chunk_scheme=None,
-                   num_chunk_types=None,
-                   classification_threshold=None,
-                   positive_label=None,
-                   dict_file=None,
-                   result_file=None,
-                   num_results=None,
-                   delimited=None,
-                   top_k=None,
-                   excluded_chunk_types=None,
-                   overlap_threshold=None,
-                   background_id=None,
-                   evaluate_difficult=None,
-                   ap_type=None):
-    """
-    Evaluator will evaluate the network status while training/testing.
-
-    User can use evaluator by classify/regression job. For example.
-
-    ..  code-block:: python
-
-        classify(prediction, output, evaluator=classification_error_evaluator)
-
-    And user could define evaluator separately as follow.
-
-    ..  code-block:: python
-
-        classification_error_evaluator("ErrorRate", prediction, label)
-
-    The evaluator often contains a name parameter. It will also be printed when
-    evaluating network. The printed information may look like the following.
-
-    ..  code-block:: text
-
-         Batch=200 samples=20000 AvgCost=0.679655 CurrentCost=0.662179 Eval:
-         classification_error_evaluator=0.4486
-         CurrentEval: ErrorRate=0.3964
-
-    :param input: Input layers, a object of LayerOutput or a list of
-                  LayerOutput.
-    :type input: list|LayerOutput
-    :param label: An input layer containing the ground truth label.
-    :type label: LayerOutput|None
-    :param weight: An input layer which is a weight for each sample.
-                   Each evaluator may calculate differently to use this weight.
-    :type weight: LayerOutput.
-    :param top_k: number k in top-k error rate
-    :type top_k: int
-    :param overlap_threshold: In detection tasks to filter detection results
-    :type overlap_threshold: float
-    :param background_id: Identifier of background class
-    :type background_id: int
-    :param evaluate_difficult: Whether to evaluate difficult objects
-    :type evaluate_difficult: bool
-    :param ap_type: How to calculate average persicion
-    :type ap_type: str
-    """
-    # inputs type assertions.
-    assert classification_threshold is None or isinstance(
-        classification_threshold, float)
-    assert positive_label is None or isinstance(positive_label, int)
-    assert num_results is None or isinstance(num_results, int)
-    assert top_k is None or isinstance(top_k, int)
-
-    if not isinstance(input, list):
-        input = [input]
-
-    if label:
-        input.append(label)
-    if weight:
-        input.append(weight)
-
-    Evaluator(
-        name=name,
-        type=type,
-        inputs=[i.name for i in input],
-        chunk_scheme=chunk_scheme,
-        num_chunk_types=num_chunk_types,
-        classification_threshold=classification_threshold,
-        positive_label=positive_label,
-        dict_file=dict_file,
-        result_file=result_file,
-        delimited=delimited,
-        num_results=num_results,
-        top_k=top_k,
-        excluded_chunk_types=excluded_chunk_types,
-        overlap_threshold=overlap_threshold,
-        background_id=background_id,
-        evaluate_difficult=evaluate_difficult,
-        ap_type=ap_type)
-
-
-@evaluator(EvaluatorAttribute.FOR_DETECTION)
-@wrap_name_default()
-def detection_map_evaluator(input,
-                            label,
-                            overlap_threshold=0.5,
-                            background_id=0,
-                            evaluate_difficult=False,
-                            ap_type="11point",
-                            name=None):
-    """
-    Detection mAP Evaluator. It will print mean Average Precision (mAP) for detection.
-
-    The detection mAP Evaluator based on the output of detection_output layer counts
-    the true positive and the false positive bbox and integral them to get the
-    mAP.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval =  detection_map_evaluator(input=det_output,label=lbl)
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param label: Label layer.
-    :type label: LayerOutput
-    :param overlap_threshold: The bbox overlap threshold of a true positive.
-    :type overlap_threshold: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :param evaluate_difficult: Whether evaluate a difficult ground truth.
-    :type evaluate_difficult: bool
-    """
-    if not isinstance(input, list):
-        input = [input]
-
-    if label:
-        input.append(label)
-
-    evaluator_base(
-        name=name,
-        type="detection_map",
-        input=input,
-        label=label,
-        overlap_threshold=overlap_threshold,
-        background_id=background_id,
-        evaluate_difficult=evaluate_difficult,
-        ap_type=ap_type)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def classification_error_evaluator(input,
-                                   label,
-                                   name=None,
-                                   weight=None,
-                                   top_k=None,
-                                   threshold=None):
-    """
-    Classification Error Evaluator. It will print error rate for classification.
-
-    The classification error is:
-
-    ..  math::
-
-        classification\\_error = \\frac{NumOfWrongPredicts}{NumOfAllSamples}
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval =  classification_error_evaluator(input=prob,label=lbl)
-
-    :param name: Evaluator name.
-    :type name: basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: basestring
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. And will just multiply to NumOfWrongPredicts
-                  and NumOfAllSamples. So, the elements of weight are all one,
-                  then means not set weight. The larger weight it is, the more
-                  important this sample is.
-    :type weight: LayerOutput
-    :param top_k: number k in top-k error rate
-    :type top_k: int
-    :param threshold: The classification threshold.
-    :type threshold: float
-    :return: None.
-    """
-
-    evaluator_base(
-        name=name,
-        type="classification_error",
-        input=input,
-        label=label,
-        weight=weight,
-        top_k=top_k,
-        classification_threshold=threshold, )
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def auc_evaluator(
-        input,
-        label,
-        name=None,
-        weight=None, ):
-    """
-    Auc Evaluator which adapts to binary classification.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = auc_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: None|basestring
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1].
-    :type weight: LayerOutput
-    """
-    evaluator_base(
-        name=name,
-        type="last-column-auc",
-        input=input,
-        label=label,
-        weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_RANK)
-@wrap_name_default()
-def pnpair_evaluator(
-        input,
-        label,
-        query_id,
-        weight=None,
-        name=None, ):
-    """
-    Positive-negative pair rate Evaluator which adapts to rank task like
-    learning to rank. This evaluator must contain at least three layers.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = pnpair_evaluator(input, label, query_id)
-
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: LayerOutput
-    :param query_id: Query_id layer name. Query_id indicates that which query
-     each sample belongs to. Its shape should be
-     the same as output of Label layer.
-    :type query_id: LayerOutput
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1] which indicates the weight of each sample.
-                  The default weight of sample is 1 if the weight layer is None.
-                  And the pair weight is the mean of the two samples' weight.
-    :type weight: LayerOutput
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    if not isinstance(input, list):
-        input = [input]
-    if label:
-        input.append(label)
-    if query_id:
-        input.append(query_id)
-    evaluator_base(
-        input=input,
-        type="pnpair",
-        weight=weight,
-        name=name, )
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def precision_recall_evaluator(
-        input,
-        label,
-        positive_label=None,
-        weight=None,
-        name=None, ):
-    """
-    An Evaluator to calculate precision and recall, F1-score.
-    It is adapt to the task with multiple labels.
-
-    - If positive_label=-1, it will print the average precision, recall,
-      F1-score of all labels.
-
-    - If use specify positive_label, it will print the precision, recall,
-      F1-score of this label.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = precision_recall_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name. The output prediction of network.
-    :type input: LayerOutput
-    :param label: Label layer name.
-    :type label: LayerOutput
-    :param positive_label: The input label layer.
-    :type positive_label: LayerOutput.
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
-    :type weight: LayerOutput
-    """
-    evaluator_base(
-        name=name,
-        type="precision_recall",
-        input=input,
-        label=label,
-        positive_label=positive_label,
-        weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def ctc_error_evaluator(
-        input,
-        label,
-        name=None, ):
-    """
-    This evaluator is to calculate sequence-to-sequence edit distance.
-
-    The simple usage is :
-
-    .. code-block:: python
-
-       eval = ctc_error_evaluator(input=input, label=lbl)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer. Should be the same as the input for ctc_layer.
-    :type input: LayerOutput
-    :param label: input label, which is a data_layer. Should be the same as the
-                  label for ctc_layer
-    :type label: LayerOutput
-    """
-    evaluator_base(
-        name=name, type="ctc_edit_distance", input=input, label=label)
-
-
-@evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
-@wrap_name_default()
-def chunk_evaluator(
-        input,
-        label,
-        chunk_scheme,
-        num_chunk_types,
-        name=None,
-        excluded_chunk_types=None, ):
-    """
-    Chunk evaluator is used to evaluate segment labelling accuracy for a
-    sequence. It calculates precision, recall and F1 scores for the chunk detection.
-
-    To use chunk evaluator, several concepts need to be clarified firstly.
-
-    * **Chunk type** is the type of the whole chunk and a chunk consists of one or several words.  (For example in NER, ORG for organization name, PER for person name etc.)
-
-    * **Tag type** indicates the position of a word in a chunk. (B for begin, I for inside, E for end, S for single)
-    We can name a label by combining tag type and chunk type. (ie. B-ORG for begining of an organization name)
-
-    The construction of label dictionary should obey the following rules:
-
-    - Use one of the listed labelling schemes. These schemes differ in ways indicating chunk boundry.
-
-    .. code-block:: text
-
-        Scheme    Description
-        plain    Use the same label for the whole chunk.
-        IOB      Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside.
-        IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
-        IOBES    Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk.
-
-    To make it clear, let's illustrate by an NER example.
-    Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
-    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
-    in which B-ORG for begining of ORG and I-ORG for inside of ORG.
-    Prefixes which are called 'tag type' here are added to chunk types and there are two tag types including B and I.
-    Of course, the training data should be labeled accordingly.
-
-    - Mapping is done correctly by the listed equations and assigning protocol.
-
-    The following table are equations to extract tag type and chunk type from a label.
-
-    .. code-block:: text
-
-        tagType = label % numTagType
-        chunkType = label / numTagType
-        otherChunkType = numChunkTypes
-
-    The following table shows the mapping rule between tagType and tag type in each scheme.
-
-    .. code-block:: text
-
-        Scheme Begin Inside End   Single
-        plain  0     -      -     -
-        IOB    0     1      -     -
-        IOE    -     0      1     -
-        IOBES  0     1      2     3
-
-    Continue the NER example, and the label dict should look like this to satify above equations:
-
-    .. code-block:: text
-
-        B-ORG  0
-        I-ORG  1
-        B-PER  2
-        I-PER  3
-        B-LOC  4
-        I-LOC  5
-        O      6
-
-    In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC, because the scheme is
-    "IOB" so tagType has two values: 0 for B and 1 for I.
-    Here we will use I-LOC to explain the above mapping rules in detail.
-    For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC
-    and the tag is I.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
-
-
-    :param input: The input layers.
-    :type input: LayerOutput
-    :param label: An input layer containing the ground truth label.
-    :type label: LayerOutput
-    :param chunk_scheme: The labelling schemes support 4 types. It is one of
-                         "IOB", "IOE", "IOBES", "plain". It is required.
-    :type chunk_scheme: basestring
-    :param num_chunk_types: number of chunk types other than "other"
-    :param name: The Evaluator name, it is optional.
-    :type name: basename|None
-    :param excluded_chunk_types: chunks of these types are not considered
-    :type excluded_chunk_types: list of integer|None
-    """
-    evaluator_base(
-        name=name,
-        type="chunk",
-        input=input,
-        label=label,
-        chunk_scheme=chunk_scheme,
-        num_chunk_types=num_chunk_types,
-        excluded_chunk_types=excluded_chunk_types, )
-
-
-@evaluator(EvaluatorAttribute.FOR_UTILS)
-@wrap_name_default()
-def sum_evaluator(
-        input,
-        name=None,
-        weight=None, ):
-    """
-    An Evaluator to sum the result of input.
-
-    The simple usage:
-
-    .. code-block:: python
-
-       eval = sum_evaluator(input)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name.
-    :type input: LayerOutput
-    :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
-    :type weight: LayerOutput
-    """
-    evaluator_base(name=name, type="sum", input=input, weight=weight)
-
-
-@evaluator(EvaluatorAttribute.FOR_UTILS)
-@wrap_name_default()
-def column_sum_evaluator(
-        input,
-        name=None,
-        weight=None, ):
-    """
-    This Evaluator is used to sum the last column of input.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = column_sum_evaluator(input, label)
-
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :param input: Input Layer name.
-    :type input: LayerOutput
-    """
-    evaluator_base(
-        name=name, type="last-column-sum", input=input, weight=weight)
-
-
-"""
-The following are printer Evaluators which are usually used to
-print the result, like value or gradient of input layers, the
-results generated in machine translation, the classification error etc.
-"""
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def value_printer_evaluator(
-        input,
-        name=None, ):
-    """
-    This Evaluator is used to print the values of input layers. It contains
-    one or more input layers.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = value_printer_evaluator(input)
-
-    :param input: One or more input layers.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(name=name, type="value_printer", input=input)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def gradient_printer_evaluator(
-        input,
-        name=None, ):
-    """
-    This Evaluator is used to print the gradient of input layers. It contains
-    one or more input layers.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = gradient_printer_evaluator(input)
-
-    :param input: One or more input layers.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(name=name, type="gradient_printer", input=input)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def maxid_printer_evaluator(
-        input,
-        num_results=None,
-        name=None, ):
-    """
-    This Evaluator is used to print maximum top k values and their indexes
-    of each row of input layers. It contains one or more input layers.
-    k is specified by num_results.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = maxid_printer_evaluator(input)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param num_results: This number is used to specify the top k numbers.
-                        It is 1 by default.
-    :type num_results: int.
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name, type="max_id_printer", input=input, num_results=num_results)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def maxframe_printer_evaluator(
-        input,
-        num_results=None,
-        name=None, ):
-    """
-    This Evaluator is used to print the top k frames of each input layers.
-    The input layers should contain sequences info or sequences type.
-    k is specified by num_results.
-    It contains one or more input layers.
-
-    Note:
-        The width of each frame is 1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = maxframe_printer_evaluator(input)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name,
-        type="max_frame_printer",
-        input=input,
-        num_results=num_results)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def seqtext_printer_evaluator(
-        input,
-        result_file,
-        id_input=None,
-        dict_file=None,
-        delimited=None,
-        name=None, ):
-    """
-    Sequence text printer will print text according to index matrix and a
-    dictionary. There can be multiple input to this layer:
-
-    1. If there is no id_input, the input must be a matrix containing
-    the sequence of indices;
-
-    2. If there is id_input, it should be ids, and interpreted as sample ids.
-
-    The output format will be:
-
-    1. sequence without sub-sequence, and there is probability.
-
-    .. code-block:: python
-
-         id \t prob space_seperated_tokens_from_dictionary_according_to_seq
-
-    2. sequence without sub-sequence, and there is not probability.
-
-    .. code-block:: python
-
-         id \t space_seperated_tokens_from_dictionary_according_to_seq
-
-    3. sequence with sub-sequence, and there is not probability.
-
-    .. code-block:: python
-
-         id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
-         \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
-         ...
-
-    Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
-    with maxid (when generating) as an input.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = seqtext_printer_evaluator(input=maxid_layer,
-                                        id_input=sample_id,
-                                        dict_file=dict_file,
-                                        result_file=result_file)
-
-    :param input: Input Layer name.
-    :type input: LayerOutput|list
-    :param result_file: Path of the file to store the generated results.
-    :type result_file: basestring
-    :param id_input: Index of the input sequence, and the specified index will
-                     be prited in the gereated results. This an optional
-                     parameter.
-    :type id_input: LayerOutput
-    :param dict_file: Path of dictionary. This is an optional parameter.
-                      Every line is a word in the dictionary with
-                      (line number - 1) as the word index.
-                      If this parameter is set to None, or to an empty string,
-                      only word index are printed in the generated results.
-    :type dict_file: basestring
-    :param delimited: Whether to use space to separate output tokens.
-                Default is True. No space is added if set to False.
-    :type delimited: bool
-    :param name: Evaluator name.
-    :type name: None|basestring
-    :return: The seq_text_printer that prints the generated sequence to a file.
-    :rtype: evaluator
-    """
-    assert isinstance(result_file, basestring)
-    if id_input is None:
-        inputs = [input]
-    else:
-        inputs = [id_input, input]
-        input.parents.append(id_input)
-
-    evaluator_base(
-        name=name,
-        type="seq_text_printer",
-        input=inputs,
-        dict_file=dict_file,
-        result_file=result_file,
-        delimited=delimited)
-
-
-@evaluator(EvaluatorAttribute.FOR_PRINT)
-@wrap_name_default()
-def classification_error_printer_evaluator(
-        input,
-        label,
-        threshold=0.5,
-        name=None, ):
-    """
-    This Evaluator is used to print the classification error of each sample.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       eval = classification_error_printer_evaluator(input)
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param label: Input label layer.
-    :type label: LayerOutput
-    :param name: Evaluator name.
-    :type name: None|basestring
-    """
-    evaluator_base(
-        name=name,
-        type="classification_error_printer",
-        input=input,
-        label=label,
-        classification_threshold=threshold)
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
deleted file mode 100644
index ee84188ba..000000000
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .layers import LayerOutput, mixed_layer, identity_projection, \
-    slope_intercept_layer, scaling_layer, repeat_layer
-from .attrs import is_compatible_with
-from .default_decorators import *
-import activations as act
-from paddle.trainer.config_parser import logger
-
-__all__ = []
-
-
-def register_unary_math_op(op_name, act):
-    def op(input, name=None):
-        return mixed_layer(
-            input=[identity_projection(input=input)], name=name, act=act)
-
-    op = wrap_name_default(op_name)(op)
-    op.__doc__ = type(act).__doc__
-    globals()[op_name] = op
-    __all__.append(op_name)
-
-
-register_unary_math_op('exp', act.ExpActivation())
-register_unary_math_op('log', act.LogActivation())
-register_unary_math_op('abs', act.AbsActivation())
-register_unary_math_op('sigmoid', act.SigmoidActivation())
-register_unary_math_op('tanh', act.TanhActivation())
-register_unary_math_op('square', act.SquareActivation())
-register_unary_math_op('relu', act.ReluActivation())
-register_unary_math_op('sqrt', act.SqrtActivation())
-register_unary_math_op('reciprocal', act.ReciprocalActivation())
-
-
-def add(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, intercept=other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be added with"
-                     " another LayerOutput or a number")
-    if layeroutput.size == other.size:
-        return mixed_layer(input=[
-            identity_projection(input=layeroutput),
-            identity_projection(input=other)
-        ])
-    if other.size != 1 and layeroutput.size != 1:
-        logger.fatal("Two LayerOutput can be added only if they have equal size"
-                     " or one of their sizes is 1. sizes are %s and %s" %
-                     (layeroutput.size, other.size))
-    elif layeroutput.size == 1:
-        tmp = layeroutput
-        layeroutput = other
-        other = tmp
-    other = repeat_layer(other, layeroutput.size)
-    return mixed_layer(input=[
-        identity_projection(input=layeroutput), identity_projection(input=other)
-    ])
-
-
-LayerOutput.__radd__ = add
-LayerOutput.__add__ = add
-
-
-def sub(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, intercept=-other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be subtracted with"
-                     " another Layeroutput or a number")
-    neg = slope_intercept_layer(input=other, slope=-1.0)
-    return add(layeroutput, neg)
-
-
-LayerOutput.__sub__ = sub
-
-
-def rsub(layeroutput, other):
-    neg = slope_intercept_layer(input=layeroutput, slope=-1.0)
-    return add(neg, other)
-
-
-LayerOutput.__rsub__ = rsub
-
-
-def mul(layeroutput, other):
-    if is_compatible_with(other, float):
-        return slope_intercept_layer(input=layeroutput, slope=other)
-    if not isinstance(other, LayerOutput):
-        logger.fatal("LayerOutput can only be multiplied with"
-                     " another Layeroutput or a number")
-    elif layeroutput.size == 1:
-        return scaling_layer(input=other, weight=layeroutput)
-    elif other.size == 1:
-        return scaling_layer(input=layeroutput, weight=other)
-    else:
-        logger.fatal("At least one of the operand of '*' must be a number"
-                     " or a LayerOutput with size=1")
-
-
-LayerOutput.__mul__ = mul
-LayerOutput.__rmul__ = mul
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
deleted file mode 100644
index ee34c1573..000000000
--- a/python/paddle/trainer_config_helpers/layers.py
+++ /dev/null
@@ -1,7610 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import functools
-import collections
-import inspect
-
-import paddle.trainer.config_parser as cp
-from paddle.trainer.config_parser import *
-from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
-    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
-from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
-    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
-from .attrs import *
-from .default_decorators import *
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import six.moves.cPickle as pickle
-import copy
-
-__all__ = [
-    'full_matrix_projection',
-    'AggregateLevel',
-    'ExpandLevel',
-    'identity_projection',
-    'dotmul_projection',
-    'dotmul_operator',
-    'repeat_layer',
-    'seq_reshape_layer',
-    'table_projection',
-    'mixed_layer',
-    'data_layer',
-    'embedding_layer',
-    'fc_layer',
-    'grumemory',
-    'pooling_layer',
-    'lstmemory',
-    'last_seq',
-    'first_seq',
-    'cos_sim',
-    'l2_distance_layer',
-    'hsigmoid',
-    'conv_projection',
-    'square_error_cost',
-    'regression_cost',
-    'classification_cost',
-    'LayerOutput',
-    'img_conv_layer',
-    'img_pool_layer',
-    'batch_norm_layer',
-    'img_cmrnorm_layer',
-    'addto_layer',
-    'concat_layer',
-    'seq_concat_layer',
-    'lstm_step_layer',
-    'recurrent_group',
-    'memory',
-    'StaticInput',
-    'expand_layer',
-    'scaling_layer',
-    'scaling_projection',
-    'power_layer',
-    'interpolation_layer',
-    'bilinear_interp_layer',
-    'trans_layer',
-    'rotate_layer',
-    'sum_to_one_norm_layer',
-    'row_l2_norm_layer',
-    'get_output_layer',
-    'LayerType',
-    'context_projection',
-    'beam_search',
-    'maxid_layer',
-    'GeneratedInput',
-    'SubsequenceInput',
-    'gru_step_layer',
-    'gru_step_naive_layer',
-    'recurrent_layer',
-    'BaseGeneratedInput',
-    'conv_operator',
-    'conv_shift_layer',
-    'tensor_layer',
-    'selective_fc_layer',
-    'sampling_id_layer',
-    'slope_intercept_layer',
-    'trans_full_matrix_projection',
-    'linear_comb_layer',
-    'convex_comb_layer',
-    'ctc_layer',
-    'warp_ctc_layer',
-    'crf_layer',
-    'crf_decoding_layer',
-    'nce_layer',
-    'cross_entropy_with_selfnorm',
-    'cross_entropy',
-    'BeamInput',
-    'cross_entropy_over_beam',
-    'multi_binary_label_cross_entropy',
-    'sum_cost',
-    'rank_cost',
-    'lambda_cost',
-    'huber_regression_cost',
-    'huber_classification_cost',
-    'block_expand_layer',
-    'maxout_layer',
-    'dot_prod_layer',
-    'out_prod_layer',
-    'printer_layer',
-    'print_layer',
-    'priorbox_layer',
-    'cross_channel_norm_layer',
-    'multibox_loss_layer',
-    'detection_output_layer',
-    'roi_pool_layer',
-    'spp_layer',
-    'pad_layer',
-    'eos_layer',
-    'smooth_l1_cost',
-    'layer_support',
-    'multiplex_layer',
-    'row_conv_layer',
-    'dropout_layer',
-    'prelu_layer',
-    'switch_order_layer',
-    'gated_unit_layer',
-    'crop_layer',
-    'sub_nested_seq_layer',
-    'clip_layer',
-    'slice_projection',
-    'seq_slice_layer',
-    'kmax_seq_score_layer',
-    'img_pool3d_layer',
-    'scale_shift_layer',
-    'img_conv3d_layer',
-    'resize_layer',
-    'sub_seq_layer',
-    'scale_sub_region_layer',
-    'upsample_layer',
-    'factorization_machine',
-]
-
-
-class LayerType(object):
-    """
-    Layer type enumerations.
-    """
-
-    DATA = 'data'
-    MIXED_LAYER = 'mixed'
-    LSTMEMORY = 'lstmemory'
-    GRUMEMORY = 'gated_recurrent'
-    SEQUENCE_LAST_INSTANCE = 'seqlastins'
-    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
-    SEQUENCE_RESHAPE = 'seqreshape'
-    POOLING_MAX = 'max'
-    POOLING_AVG = 'average'
-    UPSAMPLE_LAYER = 'upsample'
-    FC_LAYER = 'fc'
-    COST = 'cost'
-    COSINE_SIM_VEC = 'cos_vm'
-    COSINE_SIM = 'cos'
-    L2_DISTANCE = 'l2_distance'
-    HSIGMOID = 'hsigmoid'
-    CONV_LAYER = 'conv'
-    CONVTRANS_LAYER = 'convt'
-    EXCONV_LAYER = 'exconv'
-    EXCONVTRANS_LAYER = 'exconvt'
-    CUDNNCONV_LAYER = 'cudnn_conv'
-    CUDNNCONVTRANS_LAYER = 'cudnn_convt'
-    POOL_LAYER = 'pool'
-    POOL3D_LAYER = 'pool3d'
-    BATCH_NORM_LAYER = 'batch_norm'
-    NORM_LAYER = 'norm'
-    SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
-    ROW_L2_NORM_LAYER = 'row_l2_norm'
-    ADDTO_LAYER = 'addto'
-
-    CONCAT_LAYER = 'concat'
-    CONCAT_PROJ_LAYER = 'concat2'
-    SEQUENCE_CONCAT_LAYER = 'seqconcat'
-
-    LSTM_STEP_LAYER = 'lstm_step'
-    GRU_STEP_LAYER = 'gru_step'
-    GET_OUTPUT_LAYER = 'get_output'
-
-    EXPAND_LAYER = 'expand'
-    INTERPOLATION_LAYER = 'interpolation'
-    BILINEAR_INTERP_LAYER = 'bilinear_interp'
-    POWER_LAYER = 'power'
-    SCALING_LAYER = 'scaling'
-    TRANS_LAYER = 'trans'
-    ROTATE_LAYER = 'rotate'
-    DOT_PROD_LAYER = 'dot_prod'
-    OUT_PROD_LAYER = 'out_prod'
-    FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
-
-    MEMORY = 'memory'
-    MAXID_LAYER = 'maxid'
-    EOSID_LAYER = 'eos_id'
-    RECURRENT_LAYER = 'recurrent'
-
-    CONV_SHIFT_LAYER = "conv_shift"
-    TENSOR_LAYER = "tensor"
-    SEL_FC_LAYER = "selective_fc"
-    SAMPLING_ID_LAYER = "sampling_id"
-    SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    LINEAR_COMBINATION_LAYER = "convex_comb"
-    BLOCK_EXPAND = "blockexpand"
-    MAXOUT = "maxout"
-    SPP_LAYER = "spp"
-    PAD_LAYER = "pad"
-    MULTIPLEX_LAYER = "multiplex"
-    ROW_CONV_LAYER = "row_conv"
-
-    PRINT_LAYER = 'print'
-    PRIORBOX_LAYER = 'priorbox'
-    MULTIBOX_LOSS_LAYER = 'multibox_loss'
-    DETECTION_OUTPUT_LAYER = 'detection_output'
-    ROI_POOL_LAYER = 'roi_pool'
-
-    CTC_LAYER = 'ctc'
-    WARP_CTC_LAYER = 'warp_ctc'
-    CRF_LAYER = 'crf'
-    CRF_DECODING_LAYER = 'crf_decoding'
-    NCE_LAYER = 'nce'
-
-    CONV3D_LAYER = 'conv3d'
-    DECONV3D_LAYER = 'deconv3d'
-
-    RANK_COST = 'rank-cost'
-    LAMBDA_COST = 'lambda_cost'
-    HUBER_REGRESSION = 'huber_regression'
-    HUBER_CLASSIFICATION = 'huber_classification'
-    CROSS_ENTROPY = 'multi-class-cross-entropy'
-    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
-    CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
-    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
-    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
-    SUM_COST = 'sum_cost'
-    SMOOTH_L1 = 'smooth_l1'
-
-    PRELU = 'prelu'
-    SWITCH_ORDER_LAYER = 'switch_order'
-    CROP_LAYER = 'crop'
-    SUB_NESTED_SEQ = 'sub_nested_seq'
-    CLIP_LAYER = 'clip'
-    SEQ_SLICE = 'seq_slice'
-
-    KMAX_SEQ_SCORE = 'kmax_seq_score'
-    SCALE_SHIFT_LAYER = 'scale_shift'
-
-    RESIZE = 'resize'
-    SUB_SEQ_LAYER = 'subseq'
-
-    SCALE_SUB_REGION_LAYER = 'scale_sub_region'
-
-    FACTORIZATION_MACHINE = 'factorization_machine'
-
-    @staticmethod
-    def is_layer_type(type_name):
-        """
-        Whether type_name is a layer type.
-
-        :param type_name: layer type name. Because layer type enumerations are
-                          strings.
-        :type type_name: basestring
-        :return: True if is a layer_type
-        :rtype: bool
-        """
-        for key in dir(LayerType):
-            if key.isupper():
-                att = getattr(LayerType, key)
-                if isinstance(att, basestring) and type_name == att:
-                    return True
-        return False
-
-
-class AggregateLevel(object):
-    """
-    PaddlePaddle supports three sequence types:
-
-    - :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
-    - :code:`SequenceType.SEQUENCE` means the sample is a sequence.
-    - :code:`SequenceType.SUB_SEQUENCE` means the sample is a nested sequence,
-      each timestep of which is also a sequence.
-
-    Accordingly, AggregateLevel supports two modes:
-
-    - :code:`AggregateLevel.TO_NO_SEQUENCE` means the aggregation acts on each
-      timestep of a sequence, both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will
-      be aggregated to :code:`NO_SEQUENCE`.
-
-    - :code:`AggregateLevel.TO_SEQUENCE` means the aggregation acts on each
-      sequence of a nested sequence, :code:`SUB_SEQUENCE` will be aggregated to
-      :code:`SEQUENCE`.
-    """
-    TO_NO_SEQUENCE = 'non-seq'
-    TO_SEQUENCE = 'seq'
-    # compatible with previous configuration
-    EACH_TIMESTEP = TO_NO_SEQUENCE
-    EACH_SEQUENCE = TO_SEQUENCE
-
-
-class LayerOutput(object):
-    """
-    LayerOutput is output for layer function. It is used internally by several
-    reasons.
-
-    - Check layer connection make sense.
-
-        - FC(Softmax) => Cost(MSE Error) is not good for example.
-
-    - Tracking layer connection.
-
-    - Pass to layer methods as input.
-
-    :param name: Layer output name.
-    :type name: basestring
-    :param layer_type: Current Layer Type. One of LayerType enumeration.
-    :type layer_type: basestring
-    :param activation: Layer Activation.
-    :type activation: BaseActivation.
-    :param parents: Layer's parents.
-    :type parents: list | tuple | collections.Sequence
-    """
-
-    def __init__(self,
-                 name,
-                 layer_type,
-                 parents=None,
-                 activation=None,
-                 num_filters=None,
-                 img_norm_type=None,
-                 size=None,
-                 outputs=None,
-                 reverse=None):
-        assert isinstance(name, basestring)
-        assert isinstance(layer_type, basestring)
-        assert size is not None
-        assert LayerType.is_layer_type(layer_type)
-        self.name = name
-        self.full_name = MakeLayerNameInSubmodel(name)
-        self.layer_type = layer_type
-        if parents is not None and type(parents) != list:
-            parents = [parents]
-        self.parents = [] if parents is None else parents
-        self.activation = activation
-        self.num_filters = num_filters
-        self.img_norm_type = img_norm_type
-        self.size = size
-        if outputs is None:
-            outputs = ['default']
-        self.outputs = outputs
-        self.reverse = reverse
-
-    @property
-    def width(self):
-        return cp.g_layer_map[self.full_name].width
-
-    @property
-    def height(self):
-        return cp.g_layer_map[self.full_name].height
-
-    @property
-    def depth(self):
-        return cp.g_layer_map[self.full_name].depth
-
-    def set_input(self, input):
-        """
-        Set the input for a memory layer. Can only be used for memory layer
-        """
-        assert isinstance(input, LayerOutput)
-        assert self.layer_type == LayerType.MEMORY
-        SetMemoryInput(self.name, input.name)
-
-
-ERROR_CLIPPING = 'error_clipping_threshold'
-DROPOUT = 'drop_rate'
-DEVICE = 'device'
-
-
-def layer_support(*attrs):
-    attrs_list = list(attrs)
-    attrs_list.append(DEVICE)
-
-    def decorator(method):
-        @functools.wraps(method)
-        def wrapper(*args, **kwargs):
-            for attr in attrs_list:
-                for each in args:
-                    if isinstance(each, ExtraLayerAttribute):
-                        setattr(each, '_'.join(['can', attr]), True)
-                for key in kwargs:
-                    val = kwargs[key]
-                    if isinstance(val, ExtraLayerAttribute):
-                        setattr(val, '_'.join(['can', attr]), True)
-            for each in args:
-                if isinstance(each, ExtraLayerAttribute):
-                    each.check(method.__name__)
-            for key in kwargs:
-                val = kwargs[key]
-                if isinstance(val, ExtraLayerAttribute):
-                    val.check(method.__name__)
-            return method(*args, **kwargs)
-
-        if hasattr(method, 'argspec'):
-            wrapper.argspec = method.argspec
-        else:
-            wrapper.argspec = inspect.getargspec(method)
-
-        return wrapper
-
-    return decorator
-
-
-@wrap_param_attr_default()
-def full_matrix_projection(input, size=0, param_attr=None):
-    """
-    Full Matrix Projection. It performs full matrix multiplication.
-
-    ..  math::
-        out.row[i] += in.row[i] * weight
-
-    There are two styles of usage.
-
-    1. When used in mixed_layer like this, you can only set the input:
-
-    .. code-block:: python
-
-       with mixed_layer(size=100) as m:
-           m += full_matrix_projection(input=layer)
-
-    2. When used as an independent object like this, you must set the size:
-
-    .. code-block:: python
-
-       proj = full_matrix_projection(input=layer,
-                                     size=100,
-                                     param_attr=ParamAttr(name='_proj'))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: FullMatrixProjection Object.
-    :rtype: FullMatrixProjection
-    """
-    proj = FullMatrixProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def trans_full_matrix_projection(input, size=0, param_attr=None):
-    """
-    Different from full_matrix_projection, this projection performs matrix
-    multiplication, using the transpose of weight.
-
-    ..  math::
-        out.row[i] += in.row[i] * w^\mathrm{T}
-
-    :math:`w^\mathrm{T}` means the transpose of weight.
-    The simply usage is:
-
-    .. code-block:: python
-
-       proj = trans_full_matrix_projection(input=layer,
-                                           size=100,
-                                           param_attr=ParamAttr(
-                                                name='_proj',
-                                                initial_mean=0.0,
-                                                initial_std=0.01))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: TransposedFullMatrixProjection Object.
-    :rtype: TransposedFullMatrixProjection
-    """
-    proj = TransposedFullMatrixProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def table_projection(input, size=0, param_attr=None):
-    """
-    Table Projection. It selects rows from parameter where row\_id
-    is in input\_ids.
-
-    .. math::
-       out.row[i] += table.row[ids[i]]
-
-    where :math:`out` is output, :math:`table` is parameter, :math:`ids` is input\_ids,
-    and :math:`i` is row\_id.
-
-    There are two styles of usage.
-
-    1. When used in mixed_layer like this, you can only set the input:
-
-    .. code-block:: python
-
-       with mixed_layer(size=100) as m:
-           m += table_projection(input=layer)
-
-    2. When used as an independent object like this, you must set the size:
-
-    .. code-block:: python
-
-       proj = table_projection(input=layer,
-                               size=100,
-                               param_attr=ParamAttr(name='_proj'))
-
-
-    :param input: The input of this layer, which must contains id fields.
-    :type input: LayerOutput
-    :param size: The dimension of the output.
-    :type size: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: TableProjection Object.
-    :rtype: TableProjection
-    """
-    proj = TableProjection(
-        input_layer_name=input.name, size=size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-def identity_projection(input, offset=None, size=None):
-    """
-    1. If offset=None, it performs IdentityProjection as follows:
-
-    .. math::
-       out.row[i] += in.row[i]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = identity_projection(input=layer)
-
-
-    2. If offset!=None, It executes IdentityOffsetProjection and takes the
-       elements of the input in the range [offset, offset+size) as output.
-
-    .. math::
-       out.row[i] += in.row[i + \\textrm{offset}]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = identity_projection(input=layer,
-                                  offset=10)
-
-    Note that neither of the projections have trainable parameter.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param offset: The offset from the start of the input. The input's
-                   elements in the range [offset, offset+size) will be
-                   taken as output. If this parameter is not set or set
-                   to None, the output will be the same as the input.
-    :type offset: int
-    :param size: The dimension of this layer. It will be neglected
-                 when offset is None or not set.
-    :type size: int
-    :return: IdentityProjection or IdentityOffsetProjection object
-    :rtype: IdentityProjection | IdentityOffsetProjection
-    """
-    if offset is None:
-        proj = IdentityProjection(input_layer_name=input.name)
-        proj.origin = input
-    else:
-        if size is None:
-            size = input.size - offset
-        proj = IdentityOffsetProjection(
-            input_layer_name=input.name, offset=offset, size=size)
-        proj.origin = input
-    return proj
-
-
-def slice_projection(input, slices):
-    """
-    slice_projection slices the input value into multiple parts,
-    then selects and merges some of them into a new output.
-
-    .. math::
-       output = [input.slices()]
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
-
-    Note that slice_projection has no trainable parameter.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param slices: A list of start and end offsets of each slice.
-    :type slices: list of tuple
-    :return: SliceProjection object.
-    :rtype: SliceProjection
-    """
-    assert len(slices) >= 1
-    start = 0
-    for i in xrange(len(slices)):
-        assert len(slices[i]) == 2
-        # The start position of the next slice needs to be greater than
-        # or equal to the end position of the previous slice.
-        assert slices[i][0] >= start
-        assert slices[i][1] >= slices[i][0]
-        start = slices[i][1]
-    proj = SliceProjection(input_layer_name=input.name, slices=slices)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def scaling_projection(input, param_attr=None):
-    """
-    scaling_projection multiplies the input with a scalar parameter.
-
-    .. math::
-       out += w * in
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = scaling_projection(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: ScalingProjection object.
-    :rtype: ScalingProjection
-    """
-    proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-@wrap_param_attr_default()
-def dotmul_projection(input, param_attr=None):
-    """
-    DotMulProjection takes a layer as input and performs
-    element-wise multiplication with weight.
-
-    ..  math::
-        out.row[i] += in.row[i] .* weight
-
-    where :math:`.*` means element-wise multiplication.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = dotmul_projection(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: DotMulProjection object.
-    :rtype: DotMulProjection
-    """
-    proj = DotMulProjection(
-        input_layer_name=input.name, size=input.size, **param_attr.attr)
-    proj.origin = input
-    return proj
-
-
-def dotmul_operator(a=None, b=None, scale=1, **kwargs):
-    """
-    DotMulOperator takes two inputs and performs element-wise multiplication:
-
-    .. math::
-       out.row[i] += scale * (a.row[i] .* b.row[i])
-
-    where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is 1.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
-
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param scale: A scalar to scale the product. Its default value is 1.
-    :type scale: float
-    :return: DotMulOperator object.
-    :rtype: DotMulOperator
-    """
-    if 'x' in kwargs or 'y' in kwargs:
-        logger.warning('x and y arguments for dotmul_operator is deprecated. '
-                       'Please use a and b as parameter.')
-    a = kwargs.get('x', a)  # For Backward capacity.
-    b = kwargs.get('y', b)
-    assert isinstance(a, LayerOutput)
-    assert isinstance(b, LayerOutput)
-    if a.size is not None and b.size is not None:
-        assert a.size == b.size
-
-    op = DotMulOperator(input_layer_names=[a.name, b.name], scale=scale)
-    op.origin = [a, b]
-    return op
-
-
-@wrap_bias_attr_default(['padding_attr'])
-def context_projection(input,
-                       context_len,
-                       context_start=None,
-                       padding_attr=False):
-    """
-    Context Projection.
-
-    It just reorganizes input sequence, combines "context_len" elements of the
-    sequence to one context from context_start. "context_start" will be set to
-    -(context_len - 1) / 2 by default. When context position is out of sequence
-    length, padding will be filled as zero if padding_attr = False, otherwise
-    it is trainable.
-
-    For example, origin sequence is [A B C D E F G], context len is 3, padding_attr
-    is not set, then after context projection, sequence will
-    be [ 0AB ABC BCD CDE DEF EFG FG0 ].
-
-    :param input: The input of this layer, which should be a sequence.
-    :type input: LayerOutput
-    :param context_len: The length of the context.
-    :type context_len: int
-    :param context_start: The start position of the context. The default value is
-                          -(context_len - 1)/2
-    :type context_start: int
-    :param padding_attr: Parameter attribute of the padding. If the parameter is
-                         set to False, padding will be zero. In other cases, the
-                         padding is trainable, and its parameter attribute is set
-                         by this parameter.
-    :type padding_attr: bool | ParameterAttribute
-    :return: Projection object.
-    :rtype: Projection
-    """
-    context_start = -(
-        context_len - 1) / 2 if context_start is None else context_start
-
-    extra_dict = dict()
-    trainable = isinstance(padding_attr, ParameterAttribute)
-    if trainable:
-        extra_dict = padding_attr.attr
-
-    proj = ContextProjection(
-        input_layer_name=input.name,
-        context_length=context_len,
-        context_start=context_start,
-        trainable_padding=trainable,
-        **extra_dict)
-    proj.origin = input
-    return proj
-
-
-class MixedLayerType(LayerOutput):
-    """
-    The internal object for trainer_helpers.
-    """
-
-    class AddToSealedMixedLayerException(Exception):
-        def __init__(self):
-            Exception.__init__(self)
-
-    def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
-        """
-        :param name: The name of this layer.
-        :type name: basestring
-        :param size: The dimension of this layer.
-        :type size: int
-        :param act: Activation type.
-        :type act: BaseActivation
-        :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                          whose type is not ParameterAttribute, no bias is defined. If the
-                          parameter is set to True, the bias is initialized to zero.
-        :type bias_attr: ParameterAttribute | None | bool | Any
-        :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                           details.
-        :type layer_attr: ExtraLayerAttribute | None
-        """
-        LayerOutput.__init__(
-            self,
-            name,
-            LayerType.MIXED_LAYER,
-            parents,
-            size=size,
-            activation=act)
-        self.bias_attr = bias_attr
-        self.layer_attr = layer_attr
-        self.inputs = []
-        self.finalized = False
-
-    def __iadd__(self, other):
-        """
-        + += operator
-        :param other: Other projection.
-        :type other: Projection
-        :return: self.
-        :rtype: MixedLayerType
-        """
-        if not self.finalized:
-            assert isinstance(other, Projection) or isinstance(other, Operator)
-            self.inputs.append(other)
-            if isinstance(other, Projection):
-                self.parents.append(other.origin)
-            else:
-                self.parents.extend(other.origin)
-            return self
-        else:
-            raise MixedLayerType.AddToSealedMixedLayerException()
-
-    def __enter__(self):
-        assert len(self.inputs) == 0
-        return self
-
-    def __exit__(self, exc_type, exc_value, tb):
-        if exc_value is not None:
-            raise exc_value
-        assert len(self.inputs) != 0
-        ml = MixedLayer(
-            name=self.name,
-            size=self.size,
-            active_type=self.activation.name,
-            bias=ParamAttr.to_bias(self.bias_attr),
-            inputs=self.inputs,
-            **ExtraLayerAttribute.to_kwargs(self.layer_attr))
-        # update the size which might be computed inside MixedLayer
-        # according to the operator's output size
-        self.size = ml.config.size
-        self.finalized = True
-
-
-@wrap_name_default("mixed")
-@wrap_act_default(act=LinearActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def mixed_layer(size=0,
-                input=None,
-                name=None,
-                act=None,
-                bias_attr=False,
-                layer_attr=None):
-    """
-    Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
-    Each input is a projection or operator.
-
-    There are two styles of usages.
-
-    1. When the parameter input is not set, use mixed_layer like this:
-
-    .. code-block:: python
-
-       with mixed_layer(size=256) as m:
-           m += full_matrix_projection(input=layer1)
-           m += identity_projection(input=layer2)
-
-    2. You can also set all inputs when invoke mixed_layer as follows:
-
-    .. code-block:: python
-
-       m = mixed_layer(size=256,
-                       input=[full_matrix_projection(input=layer1),
-                              full_matrix_projection(input=layer2)])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The dimension of this layer.
-    :type size: int
-    :param input: The input of this layer. It is an optional parameter.
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: MixedLayerType object.
-    :rtype: MixedLayerType
-    """
-
-    if input is None:
-        return MixedLayerType(name, size, act, bias_attr, layer_attr)
-    else:
-        with mixed_layer(
-                name=name,
-                size=size,
-                act=act,
-                bias_attr=bias_attr,
-                layer_attr=layer_attr) as m:
-            if isinstance(input, collections.Sequence):
-                for each in input:
-                    m += each
-            else:
-                m += input
-        return m
-
-
-@layer_support()
-def data_layer(name, size, depth=None, height=None, width=None,
-               layer_attr=None):
-    """
-    Define DataLayer For NeuralNetwork.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        data = data_layer(name="input", size=1000)
-
-    :param name: The name of this layer.
-    :type name: basestring
-    :param size: The dimension of this data layer.
-    :type size: int
-    :param height: The height of the input image data.
-    :type height: int | None
-    :param width: The width of the input image data.
-    :type width: int | None
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        type=LayerType.DATA,
-        name=name,
-        size=size,
-        depth=depth,
-        height=height,
-        width=width,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    if depth is None:
-        depth = 1
-    num_filters = None
-    if height is not None and width is not None:
-        num_filters = size / (width * height * depth)
-        assert num_filters * width * height * depth == size, \
-                "size=%s width=%s height=%s depth=%s" % (size, width, height, depth)
-
-    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
-
-
-@wrap_name_default("embedding")
-@wrap_param_attr_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
-    """
-    Define a embedding Layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, whose type must be Index Data.
-    :type input: LayerOutput
-    :param size: The dimension of the embedding vector.
-    :type size: int
-    :param param_attr: The embedding parameter attribute. See ParameterAttribute
-                      for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name=name,
-            size=size,
-            act=LinearActivation(),
-            bias_attr=False,
-            layer_attr=layer_attr) as mix:
-        mix += table_projection(input=input, size=size, param_attr=param_attr)
-    return mix
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def fc_layer(input,
-             size,
-             act=None,
-             name=None,
-             param_attr=None,
-             bias_attr=None,
-             layer_attr=None):
-    """
-    The fully connected layer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       fc = fc_layer(input=layer,
-                     size=1024,
-                     act=LinearActivation(),
-                     bias_attr=False)
-
-    which is equal to:
-
-    .. code-block:: python
-
-       with mixed_layer(size=1024) as fc:
-           fc += full_matrix_projection(input=layer)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param size: The dimension of this layer.
-    :type size: int
-    :param act: Activation Type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            if "parameter_name" in param_attr.attr and len(input) > 1:
-                logger.fatal(
-                    "When the name field of param_attr is manually specified "
-                    "and the input is a list, the param_attr should also be a "
-                    "list with each item being the param_attr for each input "
-                    "item. If only one named param_attr is provided, all the "
-                    "input items would share this parameter.")
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-
-    Layer(
-        inputs=[
-            Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
-        ],
-        name=name,
-        type=LayerType.FC_LAYER,
-        size=size,
-        bias=ParamAttr.to_bias(bias_attr),
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.FC_LAYER, input, activation=act, size=size)
-
-
-@wrap_name_default("print")
-def printer_layer(input, format=None, name=None):
-    """
-    Print the output value of the layers specified by the parameter input.
-    This layer is useful for debugging.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    assert isinstance(input, collections.Sequence)  # list or tuple
-    for each in input:
-        assert isinstance(each, LayerOutput)
-
-    Layer(
-        name=name,
-        format=format,
-        type=LayerType.PRINT_LAYER,
-        inputs=[l.name for l in input], )
-    # this layer don't return anything, can not be input of other layer.
-
-# Keep print_layer for compatibility with V1 API.
-# 'print_layer' does not work for V2 API because it will be changed to
-# 'print' for V2 API. But 'print' is a reserved key word in python.
-
-
-print_layer = printer_layer
-
-
-@wrap_name_default("priorbox")
-def priorbox_layer(input,
-                   image,
-                   aspect_ratio,
-                   variance,
-                   min_size,
-                   max_size=[],
-                   name=None):
-    """
-    Compute the priorbox and set the variance. This layer is necessary for ssd.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param image: The network input image.
-    :type image: LayerOutput
-    :param aspect_ratio: The aspect ratio.
-    :type aspect_ratio: list
-    :param variance: The bounding box variance.
-    :type min_size: The minimum size of the priorbox width/height.
-    :param min_size: list
-    :type max_size: The maximum size of the priorbox width/height. It could be NULL.
-    :param max_size: list
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    # plus one for ratio 1.
-    num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
-    size = (input.size / input.num_filters) * num_filters * 2
-    Layer(
-        name=name,
-        type=LayerType.PRIORBOX_LAYER,
-        inputs=[input.name, image.name],
-        size=size,
-        min_size=min_size,
-        max_size=max_size,
-        aspect_ratio=aspect_ratio,
-        variance=variance)
-    return LayerOutput(
-        name,
-        LayerType.PRIORBOX_LAYER,
-        parents=[input, image],
-        num_filters=num_filters,
-        size=size)
-
-
-@wrap_name_default("multibox_loss")
-def multibox_loss_layer(input_loc,
-                        input_conf,
-                        priorbox,
-                        label,
-                        num_classes,
-                        overlap_threshold=0.5,
-                        neg_pos_ratio=3.0,
-                        neg_overlap=0.5,
-                        background_id=0,
-                        name=None):
-    """
-    Compute the location loss and the confidence loss for ssd.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input_loc: The input predicted locations.
-    :type input_loc: LayerOutput | List of LayerOutput
-    :param input_conf: The input priorbox confidence.
-    :type input_conf: LayerOutput | List of LayerOutput
-    :param priorbox: The input priorbox location and the variance.
-    :type priorbox: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param num_classes: The number of the classification.
-    :type num_classes: int
-    :param overlap_threshold: The threshold of the overlap.
-    :type overlap_threshold: float
-    :param neg_pos_ratio: The ratio of the negative bounding box to
-                          the positive bounding box.
-    :type neg_pos_ratio: float
-    :param neg_overlap: The negative bounding box overlap threshold.
-    :type neg_overlap: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input_loc, LayerOutput):
-        input_loc = [input_loc]
-    assert isinstance(input_loc, collections.Sequence)  # list or tuple
-    for each in input_loc:
-        assert isinstance(each, LayerOutput)
-    input_loc_num = len(input_loc)
-
-    if isinstance(input_conf, LayerOutput):
-        input_conf = [input_conf]
-    assert isinstance(input_conf, collections.Sequence)  # list or tuple
-    for each in input_conf:
-        assert isinstance(each, LayerOutput)
-    input_conf_num = len(input_conf)
-    # Check the input layer number.
-    assert input_loc_num == input_conf_num
-
-    inputs = [priorbox.name, label.name]
-    inputs.extend([l.name for l in input_loc])
-    inputs.extend([l.name for l in input_conf])
-    parents = [priorbox, label]
-    parents.extend(input_loc)
-    parents.extend(input_conf)
-
-    Layer(
-        name=name,
-        type=LayerType.MULTIBOX_LOSS_LAYER,
-        inputs=inputs,
-        input_num=input_loc_num,
-        num_classes=num_classes,
-        overlap_threshold=overlap_threshold,
-        neg_pos_ratio=neg_pos_ratio,
-        neg_overlap=neg_overlap,
-        background_id=background_id)
-    return LayerOutput(
-        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
-
-
-@wrap_name_default("detection_output")
-def detection_output_layer(input_loc,
-                           input_conf,
-                           priorbox,
-                           num_classes,
-                           nms_threshold=0.45,
-                           nms_top_k=400,
-                           keep_top_k=200,
-                           confidence_threshold=0.01,
-                           background_id=0,
-                           name=None):
-    """
-    Apply the NMS to the output of network and compute the predict bounding
-    box location. The output's shape of this layer could be zero if there is
-    no valid bounding box.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input_loc: The input predict locations.
-    :type input_loc: LayerOutput | List of LayerOutput.
-    :param input_conf: The input priorbox confidence.
-    :type input_conf: LayerOutput | List of LayerOutput.
-    :param priorbox: The input priorbox location and the variance.
-    :type priorbox: LayerOutput
-    :param num_classes: The number of the classes.
-    :type num_classes: int
-    :param nms_threshold: The Non-maximum suppression threshold.
-    :type nms_threshold: float
-    :param nms_top_k: The bounding boxes number kept of the NMS's output.
-    :type nms_top_k: int
-    :param keep_top_k: The bounding boxes number kept of the layer's output.
-    :type keep_top_k: int
-    :param confidence_threshold: The classification confidence threshold.
-    :type confidence_threshold: float
-    :param background_id: The background class index.
-    :type background_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input_loc, LayerOutput):
-        input_loc = [input_loc]
-    assert isinstance(input_loc, collections.Sequence)  # list or tuple
-    for each in input_loc:
-        assert isinstance(each, LayerOutput)
-    input_loc_num = len(input_loc)
-
-    if isinstance(input_conf, LayerOutput):
-        input_conf = [input_conf]
-    assert isinstance(input_conf, collections.Sequence)  # list or tuple
-    for each in input_conf:
-        assert isinstance(each, LayerOutput)
-    input_conf_num = len(input_conf)
-
-    # Check the input layer number.
-    assert input_loc_num == input_conf_num
-
-    inputs = [priorbox.name]
-    inputs.extend([l.name for l in input_loc])
-    inputs.extend([l.name for l in input_conf])
-    parents = [priorbox]
-    parents.extend(input_loc)
-    parents.extend(input_conf)
-
-    size = keep_top_k * 7
-
-    Layer(
-        name=name,
-        type=LayerType.DETECTION_OUTPUT_LAYER,
-        inputs=inputs,
-        size=size,
-        input_num=input_loc_num,
-        num_classes=num_classes,
-        nms_threshold=nms_threshold,
-        nms_top_k=nms_top_k,
-        keep_top_k=keep_top_k,
-        confidence_threshold=confidence_threshold,
-        background_id=background_id)
-    return LayerOutput(
-        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
-
-
-@wrap_name_default("roi_pool")
-def roi_pool_layer(input,
-                   rois,
-                   pooled_width,
-                   pooled_height,
-                   spatial_scale,
-                   num_channels=None,
-                   name=None):
-    """
-    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
-    feature map.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layer.
-    :type input: LayerOutput.
-    :param rois: The input ROIs' data.
-    :type rois: LayerOutput.
-    :param pooled_width: The width after pooling.
-    :type pooled_width: int
-    :param pooled_height: The height after pooling.
-    :type pooled_height: int
-    :param spatial_scale: The spatial scale between the image and feature map.
-    :type spatial_scale: float
-    :param num_channels: The number of the input channels.
-    :type num_channels: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    size = num_channels * pooled_width * pooled_height
-    Layer(
-        name=name,
-        type=LayerType.ROI_POOL_LAYER,
-        inputs=[input.name, rois.name],
-        pooled_width=pooled_width,
-        pooled_height=pooled_height,
-        spatial_scale=spatial_scale,
-        num_channels=num_channels)
-    return LayerOutput(
-        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
-
-
-@wrap_name_default("cross_channel_norm")
-def cross_channel_norm_layer(input, name=None, param_attr=None):
-    """
-    Normalize a layer's output. This layer is necessary for ssd. This
-    layer applys normalization across the channels of each sample to
-    a convolutional layer's output and scales the output by a group of
-    trainable factors whose dimensions equal to the channel's number.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.num_filters is not None
-    Layer(
-        name=name,
-        type=LayerType.NORM_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                norm=Norm(
-                    norm_type="cross-channel-norm",
-                    channels=input.num_filters,
-                    size=input.size,
-                    scale=0,
-                    pow=0,
-                    blocked=0),
-                **param_attr.attr)
-        ])
-    return LayerOutput(
-        name,
-        LayerType.NORM_LAYER,
-        parents=input,
-        num_filters=input.num_filters,
-        size=input.size)
-
-
-@wrap_name_default("seq_pooling")
-@wrap_bias_attr_default(has_bias=False)
-@wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
-@layer_support()
-def pooling_layer(input,
-                  pooling_type=None,
-                  name=None,
-                  bias_attr=None,
-                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
-                  stride=-1,
-                  layer_attr=None):
-    """
-    Pooling layer for sequence inputs, not used for Image.
-
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and returns the pooling value of the sequence in the window as the output. Thus,
-    a long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       seq_pool = pooling_layer(input=layer,
-                                pooling_type=AvgPooling(),
-                                agg_level=AggregateLevel.TO_NO_SEQUENCE)
-
-    :param agg_level: AggregateLevel.TO_NO_SEQUENCE or
-                      AggregateLevel.TO_SEQUENCE
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pooling_type: Type of pooling. MaxPooling is the default pooling.
-    :type pooling_type: BasePoolingType | None
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    extra_dict = dict()
-    # noinspection PyUnresolvedReferences
-    if isinstance(pooling_type, AvgPooling):
-        extra_dict['average_strategy'] = pooling_type.strategy
-    elif isinstance(pooling_type, MaxPooling) and \
-                    pooling_type.output_max_index is not None:
-        assert isinstance(pooling_type.output_max_index, bool)
-        extra_dict['output_max_index'] = pooling_type.output_max_index
-    extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=pooling_type.name,
-        inputs=[Input(input.name)],
-        bias=ParamAttr.to_bias(bias_attr),
-        trans_type=agg_level,
-        stride=stride,
-        **extra_dict)
-
-    return LayerOutput(
-        name, pooling_type.name, parents=[input], size=input.size)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation())
-@wrap_name_default("lstmemory")
-@layer_support()
-def lstmemory(input,
-              name=None,
-              size=None,
-              reverse=False,
-              act=None,
-              gate_act=None,
-              state_act=None,
-              bias_attr=None,
-              param_attr=None,
-              layer_attr=None):
-    """
-    Long Short-term Memory Cell.
-
-    The memory cell was implemented as follow equations.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-
-    NOTE: In PaddlePaddle's implementation, the multiplications
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in the lstmemory layer,
-    so an additional mixed_layer with full_matrix_projection or a fc_layer must
-    be included in the configuration file to complete the input-to-hidden
-    mappings before lstmemory is called.
-
-    NOTE: This is a low level user interface. You can use network.simple_lstm
-    to config a simple plain lstm layer.
-
-    Reference:
-        `Generating Sequences With Recurrent Neural Networks
-        <https://arxiv.org/pdf/1308.0850.pdf>`_
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: DEPRECATED. The dimension of the lstm cell.
-    :type size: int
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param reverse: Whether the input sequence is processed in a reverse order.
-    :type reverse: bool
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's gates. SigmoidActivation is the
-                     default activation.
-    :type gate_act: BaseActivation
-    :param state_act: Activation type of the state. TanhActivation is the default activation.
-    :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert gate_act.support_hppl
-    assert state_act.support_hppl
-    assert act.support_hppl
-    assert input.size is not None and input.size % 4 == 0
-
-    if size is not None:
-        if input.size / 4 == size:
-            plog = logger.warning
-        else:
-            plog = logger.fatal
-        plog("size of lstmemory layer: %s is automatically set to "
-             "size of input layer / 4. The parameter size passing to "
-             "this layer is ignored." % (name))
-
-    Layer(
-        name=name,
-        type=LayerType.LSTMEMORY,
-        active_type=act.name,
-        active_state_type=state_act.name,
-        active_gate_type=gate_act.name,
-        reversed=reverse,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(input.name, **param_attr.attr)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.LSTMEMORY, [input],
-        size=input.size / 4,
-        reverse=reverse)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=["act"], act=TanhActivation())
-@wrap_name_default("gru")
-@layer_support()
-def grumemory(input,
-              size=None,
-              name=None,
-              reverse=False,
-              act=None,
-              gate_act=None,
-              bias_attr=None,
-              param_attr=None,
-              layer_attr=None):
-    """
-    Gate Recurrent Unit Layer.
-
-    The memory cell was implemented as follow equations.
-
-    1. update gate :math:`z`: defines how much of the previous memory to
-    keep around or the unit updates its activations. The update gate
-    is computed by:
-
-    ..  math::
-
-        z_t = \\sigma(W_{z}x_{t} + U_{z}h_{t-1} + b_z)
-
-    2. reset gate :math:`r`: determines how to combine the new input with the
-    previous memory. The reset gate is computed similarly to the update gate:
-
-    ..  math::
-
-        r_t = \\sigma(W_{r}x_{t} + U_{r}h_{t-1} + b_r)
-
-    3. The candidate activation :math:`\\tilde{h_t}` is computed similarly to
-    that of the traditional recurrent unit:
-
-    ..  math::
-
-        {\\tilde{h_t}} = tanh(W x_{t} + U (r_{t} \odot h_{t-1}) + b)
-
-    4. The hidden activation :math:`h_t` of the GRU at time t is a linear
-    interpolation between the previous activation :math:`h_{t-1}` and the
-    candidate activation :math:`\\tilde{h_t}`:
-
-    ..  math::
-
-        h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
-
-    NOTE: In PaddlePaddle's implementation, the multiplication operations
-    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed
-    in gate_recurrent layer. Consequently, an additional mixed_layer with
-    full_matrix_projection or a fc_layer must be included before grumemory
-    is called.
-
-    Reference:
-        `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
-        <https://arxiv.org/abs/1412.3555>`_
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       gru = grumemory(input)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param size: DEPRECATED. The dimension of the gru cell.
-    :type size: int
-    :param reverse: Whether the input sequence is processed in a reverse order.
-    :type reverse: bool
-    :param act: Activation type, TanhActivation is the default. This activation
-                affects the :math:`{\\tilde{h_t}}`.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
-                     the default activation. This activation affects the :math:`z_t`
-                     and :math:`r_t`. It is the :math:`\\sigma` in the above formula.
-    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert act.support_hppl
-    assert gate_act.support_hppl
-    assert input.size is not None and input.size % 3 == 0
-    if size is not None:
-        if input.size / 3 == size:
-            plog = logger.warning
-        else:
-            plog = logger.fatal
-        plog("size of grumemory layer: %s is automatically set to "
-             "size of input layer / 3. The parameter size passing to this "
-             "layer is ignored." % (name))
-
-    Layer(
-        name=name,
-        type=LayerType.GRUMEMORY,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        reversed=reverse,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(input.name, **param_attr.attr)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.GRUMEMORY, [input],
-        size=input.size / 3,
-        reverse=reverse)
-
-
-@wrap_name_default()
-@layer_support()
-def last_seq(input,
-             name=None,
-             agg_level=AggregateLevel.TO_NO_SEQUENCE,
-             stride=-1,
-             layer_attr=None):
-    """
-    Get Last Timestamp Activation of a sequence.
-
-    If stride > 0, this layer will slide a window whose size is determined by stride,
-    and return the last value of the sequence in the window as the output. Thus, a
-    long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       seq = last_seq(input=layer)
-
-    :param agg_level: Aggregated level
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if input.reverse is not None and input.reverse:
-        logger.warning("You are getting the last instance of a sequence that"
-                       " is a output of a REVERSED layer. There is no time"
-                       " series information at all. Maybe you want to use"
-                       " first_seq instead.")
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_LAST_INSTANCE,
-        inputs=[input.name],
-        trans_type=agg_level,
-        stride=stride,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEQUENCE_LAST_INSTANCE,
-        parents=[input],
-        size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def first_seq(input,
-              name=None,
-              agg_level=AggregateLevel.TO_NO_SEQUENCE,
-              stride=-1,
-              layer_attr=None):
-    """
-    Get First Timestamp Activation of a sequence.
-
-    If stride > 0, this layer will slide a window whose size is determined by stride,
-    and return the first value of the sequence in the window as the output. Thus, a
-    long sequence will be shortened. Note that for sequence with sub-sequence, the
-    default value of stride is -1.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       seq = first_seq(input=layer)
-
-    :param agg_level: aggregation level
-    :type agg_level: AggregateLevel
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param stride: The step size between successive pooling regions.
-    :type stride: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if input.reverse is not None and not input.reverse:
-        logger.warning('You are getting the first instance for a time series,'
-                       ' and it is a normal recurrent layer output. There is no'
-                       ' time series information at all. Maybe you want to use'
-                       ' last_seq instead.')
-
-    if agg_level == AggregateLevel.TO_SEQUENCE:
-        assert stride == -1
-
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_FIRST_INSTANCE,
-        inputs=[input.name],
-        trans_type=agg_level,
-        stride=stride,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEQUENCE_FIRST_INSTANCE,
-        parents=[input],
-        size=input.size)
-
-
-class ExpandLevel(object):
-    """
-    Please refer to AggregateLevel first.
-
-    ExpandLevel supports two modes:
-
-    - :code:`ExpandLevel.FROM_NO_SEQUENCE` means the expansion acts on
-      :code:`NO_SEQUENCE`, which will be expanded to
-      :code:`SEQUENCE` or :code:`SUB_SEQUENCE`.
-
-    - :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on
-      :code:`SEQUENCE`, which will be expanded to
-      :code:`SUB_SEQUENCE`.
-    """
-    FROM_NO_SEQUENCE = AggregateLevel.TO_NO_SEQUENCE
-    FROM_SEQUENCE = AggregateLevel.TO_SEQUENCE
-    # compatible with previous configuration
-    FROM_TIMESTEP = FROM_NO_SEQUENCE
-
-
-@wrap_name_default()
-@layer_support()
-def expand_layer(input,
-                 expand_as,
-                 name=None,
-                 bias_attr=False,
-                 expand_level=ExpandLevel.FROM_NO_SEQUENCE,
-                 layer_attr=None):
-    """
-    A layer for expanding dense data or (sequence data where the length of each
-    sequence is one) to sequence data.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       expand = expand_layer(input=layer1,
-                             expand_as=layer2,
-                             expand_level=ExpandLevel.FROM_NO_SEQUENCE)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param expand_as: Expand the input according to this layer's sequence infomation. And
-                      after the operation, the input expanded will have the same number of
-                      elememts as this layer.
-    :type expand_as: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param expand_level: Whether the input layer is a sequence or the element of a sequence.
-    :type expand_level: ExpandLevel
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    Layer(
-        inputs=[input.name, expand_as.name],
-        name=name,
-        bias=ParamAttr.to_bias(bias_attr=bias_attr),
-        type=LayerType.EXPAND_LAYER,
-        trans_type=expand_level,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=input.size,
-        layer_type=LayerType.EXPAND_LAYER,
-        parents=[input, expand_as])
-
-
-@wrap_name_default()
-@wrap_act_default(act=IdentityActivation())
-@layer_support()
-def repeat_layer(input,
-                 num_repeats,
-                 as_row_vector=True,
-                 act=None,
-                 name=None,
-                 layer_attr=None):
-    """
-    A layer for repeating the input for num_repeats times.
-
-    If as_row_vector:
-
-    .. math::
-       y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
-
-    If not as_row_vector:
-
-    .. math::
-       y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
-
-
-    The example usage is:
-
-    .. code-block:: python
-
-       expand = repeat_layer(input=layer, num_repeats=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_repeats: The times of repeating the input.
-    :type num_repeats: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param as_row_vector: Whether to treat the input as row vectors or not. If
-                          the parameter is set to True, the repeating operation
-                          will be performed in the column direction. Otherwise,
-                          it will be performed in the row direction.
-    :type as_row_vector: bool
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    l = Layer(
-        inputs=[input.name],
-        name=name,
-        active_type=act.name,
-        num_filters=num_repeats,
-        as_row_vector=as_row_vector,
-        type=LayerType.FEATURE_MAP_EXPAND_LAYER,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=l.config.size,
-        layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
-        activation=act,
-        parents=[input])
-
-
-@wrap_name_default("seqreshape")
-@wrap_act_default(act=IdentityActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def seq_reshape_layer(input,
-                      reshape_size,
-                      act=None,
-                      name=None,
-                      layer_attr=None,
-                      bias_attr=None):
-    """
-    A layer for reshaping the sequence. Assume the input sequence has T instances,
-    the dimension of each instance is M, and the input reshape_size is N, then the
-    output sequence has T*M/N instances, the dimension of each instance is N.
-
-    Note that T*M/N must be an integer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       reshape = seq_reshape_layer(input=layer, reshape_size=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param reshape_size: The dimension of the reshaped sequence.
-    :type reshape_size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    Layer(
-        inputs=[input.name],
-        name=name,
-        size=reshape_size,
-        type=LayerType.SEQUENCE_RESHAPE,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        size=reshape_size,
-        layer_type=LayerType.SEQUENCE_RESHAPE,
-        parents=[input])
-
-
-@wrap_name_default()
-@layer_support()
-def interpolation_layer(input, weight, name=None, layer_attr=None):
-    """
-    This layer performs linear interpolation on two inputs,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
-
-    where :math:`x_1` and :math:`x_2` are two (batchSize x dataDim) inputs,
-    :math:`w` is (batchSize x 1) weight vector, and :math:`y` is
-    (batchSize x dataDim) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       interpolation = interpolation_layer(input=[layer1, layer2], weight=layer3)
-
-    :param input: The input of this layer.
-    :type input: list | tuple
-    :param weight: Weight layer.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, collections.Sequence)
-    assert len(input) == 2
-    assert isinstance(input[0], LayerOutput) and isinstance(input[1],
-                                                            LayerOutput)
-    if input[0].size is not None and input[1].size is not None:
-        assert input[0].size == input[1].size
-    assert isinstance(weight, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.INTERPOLATION_LAYER,
-        inputs=[weight.name, input[0].name, input[1].name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.INTERPOLATION_LAYER,
-        parents=[weight, input[0], input[1]],
-        size=input[0].size)
-
-
-@wrap_name_default()
-@layer_support()
-def bilinear_interp_layer(input,
-                          out_size_x=None,
-                          out_size_y=None,
-                          name=None,
-                          layer_attr=None):
-    """
-    This layer implements bilinear interpolation on convolutional layer's output.
-
-    Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param out_size_x: The width of the output.
-    :type out_size_x: int
-    :param out_size_y: The height of the output.
-    :type out_size_y: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.layer_type == LayerType.CONV_LAYER
-    assert isinstance(input.activation, LinearActivation)
-    assert out_size_x > 0 and out_size_y > 0
-    assert input.num_filters is not None
-    num_channels = input.num_filters
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            bilinear_interp=BilinearInterp(
-                out_size_x=out_size_x,
-                out_size_y=out_size_y,
-                channels=num_channels)),
-        type=LayerType.BILINEAR_INTERP_LAYER,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.BILINEAR_INTERP_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def power_layer(input, weight, name=None, layer_attr=None):
-    """
-    This layer applies a power function to a vector element-wise,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       y = x^w
-
-    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
-    and :math:`y` is an output vector.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       power = power_layer(input=layer1, weight=layer2)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param weight: The exponent of the power.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput) and isinstance(weight, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.POWER_LAYER,
-        inputs=[weight.name, input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.POWER_LAYER, parents=[input, weight], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def scaling_layer(input, weight, name=None, layer_attr=None):
-    """
-    A layer for multiplying input vector by weight scalar.
-
-    .. math::
-       y  = w x
-
-    where :math:`x` is size=dataDim input, :math:`w` is size=1 weight,
-    and :math:`y` is size=dataDim output.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       scale = scaling_layer(input=layer1, weight=layer2)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param weight: The weight of each sample.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(weight, LayerOutput) and isinstance(input, LayerOutput)
-    if weight.size is not None:
-        assert weight.size == 1
-    Layer(
-        name=name,
-        type=LayerType.SCALING_LAYER,
-        inputs=[weight.name, input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SCALING_LAYER, parents=[weight, input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def trans_layer(input, name=None, layer_attr=None):
-    """
-    A layer for transposing a minibatch matrix.
-
-    .. math::
-       y = x^\mathrm{T}
-
-    where :math:`x` is (M x N) input, and :math:`y` is (N x M) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       trans = trans_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.TRANS_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.TRANS_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def rotate_layer(input, height, width, name=None, layer_attr=None):
-    """
-    A layer for rotating 90 degrees (clock-wise) for each feature channel,
-    usually used when the input sample is some image or feature map.
-
-    .. math::
-       y(j,i,:) = x(M-i-1,j,:)
-
-    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       rot = rotate_layer(input=layer,
-                          height=100,
-                          width=100)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param height: The height of the sample matrix.
-    :type height: int
-    :param width: The width of the sample matrix.
-    :type width: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    l = Layer(
-        name=name,
-        height=height,
-        width=width,
-        type=LayerType.ROTATE_LAYER,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.ROTATE_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
-    """
-    Cosine Similarity Layer. The cosine similarity equation is here.
-
-    ..  math::
-        similarity = cos(\\theta) = {\\mathbf{a} \\cdot \\mathbf{b}
-        \\over \\|\\mathbf{a}\\| \\|\\mathbf{b}\\|}
-
-    The size of a is M, size of b is M*N,
-    Similarity will be calculated N times by step M. The output size is
-    N. The scale will be multiplied to similarity.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cos = cos_sim(a=layer1, b=layer2, size=3)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param scale: The scale of the cosine similarity. 1 is the default value.
-    :type scale: float
-    :param size: The dimension of this layer. NOTE size_a * size should equal size_b.
-    :type size: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    if size == 1:
-        Layer(
-            name=name,
-            type=LayerType.COSINE_SIM,
-            cos_scale=scale,
-            inputs=[a.name, b.name],
-            **ExtraLayerAttribute.to_kwargs(layer_attr))
-    else:
-        if a.size is not None and b.size is not None:
-            assert size == b.size / a.size
-        Layer(
-            name=name,
-            type=LayerType.COSINE_SIM_VEC,
-            size=size,
-            cos_scale=scale,
-            inputs=[a.name, b.name],
-            **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def l2_distance_layer(x, y, name=None, layer_attr=None):
-    """
-    This layer calculates and returns the Euclidean distance between two input
-    vectors x and y. The equation is as follows:
-
-    ..  math::
-        l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)}
-
-    The output size of this layer is fixed to be 1. Note that the above
-    computation is for one sample. Multiple samples are processed in one batch.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       l2_sim = l2_distance(x=layer1, y=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param x: The first input x for this layer, whose output is a matrix with
-              dimensionality N x D. N is the sample number in a mini-batch.
-              D is the dimensionality of x's output.
-    :type x: LayerOutput
-    :param y: The second input y for this layer, whose output is a matrix with
-              dimensionality N x D. N is the sample number in a mini-batch.
-              D is the dimensionality of y's output.
-    :type y: LayerOutput
-    :param layer_attr: The extra layer attributes, for example, drop rate.
-                       See ExtraLayerAttribute for more details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: The returned LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.L2_DISTANCE,
-        inputs=[x.name, y.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
-
-
-@wrap_name_default()
-@wrap_bias_attr_default(has_bias=True)
-@wrap_param_attr_default()
-@layer_support()
-def hsigmoid(input,
-             label,
-             num_classes=None,
-             name=None,
-             bias_attr=None,
-             param_attr=None,
-             layer_attr=None):
-    """
-    Organize the classes into a binary tree. At each node, a sigmoid function
-    is used to calculate the probability of belonging to the right branch.
-
-    Reference:
-        `Hierarchical Probabilistic Neural Network Language Model
-        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        cost = hsigmoid(input=[layer1, layer2],
-                        label=data_layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param label: The input label.
-    :type label: LayerOutput
-    :param num_classes: The number of classes. And it should be larger than 2. If the parameter
-                        is not set or set to None, its actual value will be automatically set to
-                        the number of labels.
-    :type num_classes: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        if not isinstance(param_attr, collections.Sequence):
-            param_attr = [param_attr]
-    else:
-        if not isinstance(param_attr, collections.Sequence):
-            param_attr = [param_attr] * len(input)
-        else:
-            assert len(param_attr) == len(input)
-
-    assert isinstance(input, collections.Sequence)
-    assert isinstance(label, LayerOutput)
-    assert label.layer_type == LayerType.DATA
-
-    if num_classes is None:
-        num_classes = label.size
-    if num_classes is None or num_classes <= 2:
-        raise ValueError("hsigmoid label size must larger than 2.")
-
-    ipts_for_layer = []
-    parents = []
-    for each_input, each_param_attr in zip(input, param_attr):
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name, **each_param_attr.attr))
-        parents.append(each_input)
-    ipts_for_layer.append(label.name)
-    parents.append(label)
-
-    l = Layer(
-        name=name,
-        type=LayerType.HSIGMOID,
-        num_classes=num_classes,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=ipts_for_layer,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HSIGMOID, parents=parents, size=l.config.size)
-
-
-@wrap_name_default("conv")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=ReluActivation())
-@layer_support(DROPOUT)
-def img_conv_layer(input,
-                   filter_size,
-                   num_filters,
-                   name=None,
-                   num_channels=None,
-                   act=None,
-                   groups=1,
-                   stride=1,
-                   padding=0,
-                   dilation=1,
-                   bias_attr=None,
-                   param_attr=None,
-                   shared_biases=True,
-                   layer_attr=None,
-                   filter_size_y=None,
-                   stride_y=None,
-                   padding_y=None,
-                   dilation_y=None,
-                   trans=False,
-                   layer_type=None):
-    """
-    Convolution layer for image. Paddle can support both square and non-square
-    input currently.
-
-    The details of convolution layer, please refer UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/
-    FeatureExtractionUsingConvolution/>`_ .
-
-    Convolution Transpose (deconv) layer for image. Paddle can support both square
-    and non-square input currently.
-
-    The details of convolution transpose layer,
-    please refer to the following explanation and references therein
-    <http://datascience.stackexchange.com/questions/6107/
-    what-are-deconvolutional-layers/>`_ .
-    The num_channel means input image's channel number. It may be 1 or 3 when
-    input is raw pixels of image(mono or RGB), or it may be the previous layer's
-    num_filters.
-
-    There are several groups of filters in PaddlePaddle implementation.
-    If the groups attribute is greater than 1, for example groups=2,
-    the input will be splitted into 2 parts along the channel axis, and
-    the filters will also be splitted into 2 parts. The first half of the filters 
-    is only connected to the first half of the input channels, while the second 
-    half of the filters is only connected to the second half of the input. After
-    the computation of convolution for each part of input,
-    the output will be obtained by concatenating the two results.
-
-    The details of grouped convolution, please refer to:
-    `ImageNet Classification With Deep Convolutional Neural Networks
-    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
-    
-    The example usage is:
-
-    ..  code-block:: python
-
-        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
-                              num_channels=8,
-                              num_filters=16, stride=1,
-                              bias_attr=False,
-                              act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel. If the parameter is
-                        set to one integer, the two dimensions on x and y axises
-                        will be same when filter_size_y is not set. If it is set
-                        to a list, the first element indicates the dimension on
-                        the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size_y is not provided.
-    :type filter_size: int | tuple | list
-    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
-                          is not set, it will be set automatically according to filter_size.
-    :type filter_size_y: int
-    :param num_filters: The number of filters. It is as same as the output image channel.
-    :type num_filters: int
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param groups: The group number. 1 is the default group number.
-    :type groups: int
-    :param stride: The strides. If the parameter is set to one integer, the strides
-                   on x and y axises will be same when stride_y is not set. If it is
-                   set to a list, the first element indicates the stride on the x axis,
-                   and the second is used to specify the stride on the y axis when
-                   stride_y is not provided. 1 is the default value.
-    :type stride: int | tuple | list
-    :param stride_y: The stride on the y axis.
-    :type stride_y: int
-    :param padding: The padding sizes. If the parameter is set to one integer, the padding
-                    sizes on x and y axises will be same when padding_y is not set. If it
-                    is set to a list, the first element indicates the padding size on the
-                    x axis, and the second is used to specify the padding size on the y axis
-                    when padding_y is not provided. 0 is the default padding size.
-    :type padding: int | tuple | list
-    :param padding_y: The padding size on the y axis.
-    :type padding_y: int
-    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
-                     the two dimensions on x and y axises will be same when dilation_y is not
-                     set. If it is set to a list, the first element indicates the dimension
-                     on the x axis, and the second is used to specify the dimension on the y
-                     axis when dilation_y is not provided. 1 is the default dimension.
-    :type dilation: int | tuple | list
-    :param dilation_y: The dimension of the dilation on the y axis.
-    :type dilation_y: int
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channel number of the input.
-    :type num_channels: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param shared_biases: Whether biases will be shared between filters or not.
-    :type shared_biases: bool
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param trans: True if it is a convTransLayer, False if it is a convLayer
-    :type trans: bool
-    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
-                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
-                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
-    :type layer_type: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if filter_size_y is None:
-        if isinstance(filter_size, collections.Sequence):
-            assert len(filter_size) == 2
-            filter_size, filter_size_y = filter_size
-        else:
-            filter_size_y = filter_size
-
-    if stride_y is None:
-        if isinstance(stride, collections.Sequence):
-            assert len(stride) == 2
-            stride, stride_y = stride
-        else:
-            stride_y = stride
-
-    if padding_y is None:
-        if isinstance(padding, collections.Sequence):
-            assert len(padding) == 2
-            padding, padding_y = padding
-        else:
-            padding_y = padding
-
-    if dilation_y is None:
-        if isinstance(dilation, collections.Sequence):
-            assert len(dilation) == 2
-            dilation, dilation_y = dilation
-        else:
-            dilation_y = dilation
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    if layer_type:
-        if dilation > 1 or dilation_y > 1:
-            assert layer_type in [
-                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
-            ]
-        if trans:
-            assert layer_type in ["exconvt", "cudnn_convt"]
-        else:
-            assert layer_type in ["exconv", "cudnn_conv"]
-        lt = layer_type
-    else:
-        lt = LayerType.CONVTRANS_LAYER if trans else LayerType.CONV_LAYER
-
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            conv=Conv(
-                filter_size=filter_size,
-                padding=padding,
-                dilation=dilation,
-                stride=stride,
-                channels=num_channels,
-                groups=groups,
-                filter_size_y=filter_size_y,
-                padding_y=padding_y,
-                dilation_y=dilation_y,
-                stride_y=stride_y),
-            **param_attr.attr),
-        active_type=act.name,
-        num_filters=num_filters,
-        bias=ParamAttr.to_bias(bias_attr),
-        shared_biases=shared_biases,
-        type=lt,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        lt,
-        parents=[input],
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_name_default("pool")
-@layer_support()
-def img_pool_layer(input,
-                   pool_size,
-                   name=None,
-                   num_channels=None,
-                   pool_type=None,
-                   stride=1,
-                   padding=0,
-                   layer_attr=None,
-                   pool_size_y=None,
-                   stride_y=None,
-                   padding_y=None,
-                   ceil_mode=True,
-                   exclude_mode=None):
-    """
-    Image pooling Layer.
-
-    The details of pooling layer, please refer to ufldl's pooling_ .
-
-    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
-
-    - ceil_mode=True:
-
-    ..  math::
-
-        w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride})
-
-        h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
-
-    - ceil_mode=False:
-
-    ..  math::
-
-        w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride})
-
-        h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        maxpool = img_pool_layer(input=conv,
-                                 pool_size=3,
-                                 pool_size_y=5,
-                                 num_channels=8,
-                                 stride=1,
-                                 stride_y=2,
-                                 padding=1,
-                                 padding_y=2,
-                                 pool_type=MaxPooling())
-
-    :param padding: The padding size on the x axis. 0 is the default padding size.
-    :type padding: int
-    :param padding_y: The padding size on the y axis. If the parameter is not set
-                      or set to None, it will be set to 'padding' automatically.
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pool_size: The pooling window length on the x axis.
-    :type pool_size: int
-    :param pool_size_y: The pooling window length on the y axis. If the parameter is
-                        not set or set to None, its actual value will be automatically
-                        set to pool_size.
-    :type pool_size_y: int
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type pool_type: BasePoolingType
-    :param stride: The stride on the x axis. 1 is the default value.
-    :type stride: int
-    :param stride_y: The stride on the y axis. If the parameter is not set or set to
-                     None, its actual value will be automatically set to 'stride'.
-    :type stride_y: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
-                      True is the default. If it is set to False, the floor function will
-                      be used.
-    :type ceil_mode: bool
-    :param exclude_mode: Whether to exclude the padding cells when calculating, but only 
-                         work when pool_type is AvgPooling. If None, also exclude the padding 
-                         cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling 
-                         as pool_type to identify the mode.
-    :type exclude_mode: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
-                               CudnnMaxPooling, CudnnAvgInclPadPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
-
-    type_name = pool_type.name + '-projection' \
-        if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
-    pool_size_y = pool_size if pool_size_y is None else pool_size_y
-    stride_y = stride if stride_y is None else stride_y
-    padding_y = padding if padding_y is None else padding_y
-
-    l = Layer(
-        name=name,
-        type=LayerType.POOL_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                pool=Pool(
-                    pool_type=type_name,
-                    channels=num_channels,
-                    size_x=pool_size,
-                    start=None,
-                    stride=stride,
-                    padding=padding,
-                    size_y=pool_size_y,
-                    stride_y=stride_y,
-                    padding_y=padding_y))
-        ],
-        ceil_mode=ceil_mode,
-        exclude_mode=exclude_mode,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.POOL_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default("pool3d")
-@layer_support()
-def img_pool3d_layer(input,
-                     pool_size,
-                     name=None,
-                     num_channels=None,
-                     pool_type=None,
-                     stride=1,
-                     padding=0,
-                     layer_attr=None,
-                     pool_size_y=None,
-                     stride_y=None,
-                     padding_y=None,
-                     pool_size_z=None,
-                     stride_z=None,
-                     padding_z=None,
-                     ceil_mode=True):
-    """
-    Image pooling Layer.
-
-    The details of pooling layer, please refer ufldl's pooling_ .
-
-    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
-
-    - ceil_mode=True:
-
-    ..  math::
-
-        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
-
-        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
-
-        d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
-
-    - ceil_mode=False:
-
-    ..  math::
-
-        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
-
-        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
-
-        d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z}
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        maxpool = img_pool3d_layer(input=conv,
-                                 pool_size=3,
-                                 num_channels=8,
-                                 stride=1,
-                                 padding=1,
-                                 pool_type=MaxPooling())
-
-    :param padding: pooling padding width.
-    :type padding: int | tuple | list
-    :param name: The name of this layer. It is optional.
-    :type name: basestring.
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pool_size: The pooling window lengths along three axises. If the parameter
-                      is set to one integer, the three lengths will be same.
-    :type pool_size: int | tuple | list
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type pool_type: BasePoolingType
-    :param stride: The strides of the pooling along three axises. If the parameter
-                   is set to one integer, the three strides will be same. 1 is the
-                   default value.
-    :type stride: int | tuple | list
-    :param padding: The sizes of padding along three axises. If the parameter is set to
-                    one integer, they will be same. 0 is the default padding size.
-    :type padding: int | tuple | list
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use the ceil function to calculate output height and width.
-                      True is the default. If it is set to False, the floor function will
-                      be used.
-    :type ceil_mode: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    type_name = pool_type.name + '-projection' \
-        if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-        else pool_type.name
-
-    if isinstance(pool_size, collections.Sequence):
-        assert len(pool_size) == 3
-        pool_size, pool_size_y, pool_size_z = pool_size
-    else:
-        pool_size_y = pool_size
-        pool_size_z = pool_size
-
-    if isinstance(stride, collections.Sequence):
-        assert len(stride) == 3
-        stride, stride_y, stride_z = stride
-    else:
-        stride_y = stride
-        stride_z = stride
-
-    if isinstance(padding, collections.Sequence):
-        assert len(padding) == 3
-        padding, padding_y, padding_y = padding
-    else:
-        padding_y = padding
-        padding_z = padding
-
-    l = Layer(
-        name=name,
-        type=LayerType.POOL3D_LAYER,
-        inputs=[
-            Input(
-                input.name,
-                pool=Pool3d(
-                    pool_type=type_name,
-                    channels=num_channels,
-                    size_x=pool_size,
-                    start=None,
-                    stride=stride,
-                    padding=padding,
-                    size_y=pool_size_y,
-                    stride_y=stride_y,
-                    padding_y=padding_y,
-                    size_z=pool_size_z,
-                    stride_z=stride_z,
-                    padding_z=padding_z))
-        ],
-        ceil_mode=ceil_mode,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.POOL_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default("upsample")
-@layer_support()
-def upsample_layer(input,
-                   name=None,
-                   scale=None,
-                   scale_y=None,
-                   upsample_size=None,
-                   upsample_size_y=None,
-                   pad_out_x=False,
-                   pad_out_y=False,
-                   layer_attr=None):
-    """
-    The DePooling process.
-    Inputs should be a list of length 2. The first input is a layer,
-    and the second input should be the MaxWithMaskPoolingLayer
-
-    The example usage is:
-
-    ..  code-block:: python
-        pool1 = paddle.v2.layer.img_pool(input=input, pool_size=2, stride=2,
-                                        pool_type=paddle.pooling.MaxWithMask())
-        upsample = paddle.v2.layer.upsample(input=[layer1, pool1])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: contains an input layer and a MaxWithMaskPoolingLayer
-    :type input: list | tuple | collections.Sequence
-    :param scale: outputSize =  scale * inputSize
-    :type scale: int | list | tuple | .
-    :param scale_y: scale_y will be equal to scale, if it's value is None, 
-    :type scale: int | None. 
-    :param upsample_size: specify the outputSize.
-    :type upsample_size: int | list | tuple.
-    :param upsample_size_y: specify the y dimension outputSize.
-    :type upsample_size_y: int.
-    :param pad_out_x: specify exact x dimension size. This parameter only works when scale is 2
-    :type pad_out_x: bool.
-    :param pad_out_y: specify exact y dimension size. This parameter only works when scale is 2
-    :type pad_out_y: bool.
-    :param layer_attr: Extra Layer Attribute.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert (scale is not None) or (upsample_size is not None), \
-            'scale or upsample_size, there must be one to be designated'
-
-    assert len(input) == 2, 'layer input size must be 2'
-
-    assert input[1].layer_type == LayerType.POOL_LAYER, \
-            'the second input should be the MaxPoolWithMaskLayer'
-
-    scale_y = scale \
-            if scale is not None else scale_y
-    upsample_size_y = upsample_size  \
-            if upsample_size is not None else upsample_size_y
-
-    layer_type = LayerType.UPSAMPLE_LAYER
-
-    layer = Layer(
-        name=name,
-        type=layer_type,
-        inputs=[
-            Input(
-                input[0].name,
-                upsample=Upsample(scale, scale_y, pad_out_x, pad_out_y,
-                                  upsample_size, upsample_size_y)),
-            Input(input[1].name)
-        ],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    sz = layer.config.size
-
-    return LayerOutput(name, layer_type=layer_type, parents=input, size=sz)
-
-
-@wrap_name_default("spp")
-@layer_support()
-def spp_layer(input,
-              name=None,
-              num_channels=None,
-              pool_type=None,
-              pyramid_height=None,
-              layer_attr=None):
-    """
-    A layer performs spatial pyramid pooling.
-
-    Reference:
-        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
-        <https://arxiv.org/abs/1406.4729>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        spp = spp_layer(input=data,
-                        pyramid_height=2,
-                        num_channels=16,
-                        pool_type=MaxPooling())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling is the default pooling.
-    :type scale: BasePoolingType
-    :param pyramid_height: The pyramid height of this pooling.
-    :type pyramid_height: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if pool_type is None:
-        pool_type = MaxPooling()
-    elif isinstance(pool_type, AvgPooling):
-        pool_type.name = 'avg'
-
-    type_name = pool_type.name
-    if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)):
-        type_name += '-projection'
-
-    l = Layer(
-        name=name,
-        type=LayerType.SPP_LAYER,
-        inputs=Input(
-            input.name,
-            spp=SpatialPyramidPool(
-                pool_type=type_name,
-                channels=num_channels,
-                pyramid_height=pyramid_height)),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.SPP_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-def __img_norm_layer__(name, input, size, norm_type, scale, power, num_channels,
-                       blocked, layer_attr):
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    l = Layer(
-        name=name,
-        type=LayerType.NORM_LAYER,
-        inputs=Input(
-            input.name,
-            norm=Norm(
-                norm_type=norm_type,
-                channels=num_channels,
-                size=size,
-                scale=scale,
-                pow=power,
-                blocked=blocked)),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.NORM_LAYER,
-        parents=[input],
-        num_filters=num_channels,
-        img_norm_type=norm_type,
-        size=l.config.size)
-
-
-@wrap_name_default("crmnorm")
-@layer_support()
-def img_cmrnorm_layer(input,
-                      size,
-                      scale=0.0128,
-                      power=0.75,
-                      name=None,
-                      num_channels=None,
-                      layer_attr=None):
-    """
-    Response normalization across feature maps.
-
-    Reference:
-        `ImageNet Classification with Deep Convolutional Neural Networks
-        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        norm = img_cmrnorm_layer(input=net, size=5)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: Normalize in number of :math:`size` feature maps.
-    :type size: int
-    :param scale: The hyper-parameter.
-    :type scale: float
-    :param power: The hyper-parameter.
-    :type power: float
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    return __img_norm_layer__(name, input, size, "cmrnorm-projection", scale,
-                              power, num_channels, 0, layer_attr)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default(
-    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
-@wrap_act_default(act=ReluActivation())
-@wrap_name_default("batch_norm")
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def batch_norm_layer(input,
-                     act=None,
-                     name=None,
-                     img3D=False,
-                     num_channels=None,
-                     bias_attr=None,
-                     param_attr=None,
-                     layer_attr=None,
-                     batch_norm_type=None,
-                     epsilon=1e-5,
-                     moving_average_fraction=0.9,
-                     use_global_stats=None,
-                     mean_var_names=None):
-    """
-    Batch Normalization Layer. The notation of this layer is as follows.
-
-    :math:`x` is the input features over a mini-batch.
-
-    ..  math::
-
-        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
-        \ mini-batch\ mean \\\\
-        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
-        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
-        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
-        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
-        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
-
-    Reference:
-        `Batch Normalization: Accelerating Deep Network Training by Reducing
-        Internal Covariate Shift
-        <http://arxiv.org/abs/1502.03167>`_
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        norm = batch_norm_layer(input=net, act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: This layer's input which is to be performed batch normalization on.
-    :type input: LayerOutput
-    :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
-                            batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
-                            requires cuDNN version greater or equal to v4 (>=v4).
-                            But cudnn_batch_norm is faster and needs less
-                            memory than batch_norm. mkldnn_batch_norm requires
-                            use_mkldnn is enabled. By default (None), we will
-                            automatically select cudnn_batch_norm for GPU,
-                            mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
-                            Users can specify the batch norm type. If you use
-                            cudnn_batch_norm, we suggested you use latest version,
-                            such as v5.1.
-    :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
-                           or "mkldnn_batch_norm"
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
-                      False or an object whose type is not ParameterAttribute, no
-                      bias is defined. If the parameter is set to True, the bias is
-                      initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
-                       for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param use_global_stats: Whether use moving mean/variance statistics during
-                             testing peroid. If the parameter is set to None or
-                             True, it will use moving mean/variance statistics
-                             during testing. If the parameter is set to False, it
-                             will use the mean and variance of the current batch
-                             of test data.
-    :type use_global_stats: bool | None.
-    :param epsilon: The small constant added to the variance to improve numeric stability.
-    :type epsilon: float.
-    :param moving_average_fraction: Factor used in the moving average computation.
-                                   :math:`runningMean = newMean*(1-factor) + runningMean*factor`
-    :type moving_average_fraction: float.
-    :param mean_var_names: [mean name, variance name]
-    :type mean_var_names: string list
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if num_channels is None:
-        if input.num_filters is not None:
-            num_channels = input.num_filters
-        else:
-            num_channels = input.size
-    assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
-           (batch_norm_type == "mkldnn_batch_norm") or \
-           (batch_norm_type == "cudnn_batch_norm")
-
-    l = Layer(
-        name=name,
-        img3D=img3D,
-        inputs=Input(
-            input.name, image=Image(channels=num_channels), **param_attr.attr),
-        active_type=act.name,
-        type=LayerType.BATCH_NORM_LAYER,
-        batch_norm_type=batch_norm_type,
-        bias=ParamAttr.to_bias(bias_attr),
-        epsilon=epsilon,
-        moving_average_fraction=moving_average_fraction,
-        use_global_stats=use_global_stats,
-        mean_var_names=mean_var_names,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.BATCH_NORM_LAYER,
-        parents=[input],
-        activation=act,
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def sum_to_one_norm_layer(input, name=None, layer_attr=None):
-    """
-    A layer for sum-to-one normalization,
-    which is used in NEURAL TURING MACHINE.
-
-    .. math::
-       out[i] = \\frac {in[i]} {\sum_{k=1}^N in[k]}
-
-    where :math:`in` is a (batchSize x dataDim) input vector,
-    and :math:`out` is a (batchSize x dataDim) output vector.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       sum_to_one_norm = sum_to_one_norm_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
-                       for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SUM_TO_ONE_NORM_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def row_l2_norm_layer(input, name=None, layer_attr=None):
-    """
-    A layer for L2-normalization in each row.
-
-    .. math::
-       out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}}
-
-    where the size of :math:`in` is (batchSize x dataDim) ,
-    and the size of :math:`out` is a (batchSize x dataDim) .
-
-    The example usage is:
-
-    .. code-block:: python
-
-       row_l2_norm_layer = row_l2_norm_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
-                       for details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.ROW_L2_NORM_LAYER,
-        inputs=[input.name],
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default("addto")
-@wrap_act_default(act=LinearActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
-    """
-    AddtoLayer.
-
-    ..  math::
-
-        y = f(\\sum_{i} x_i + b)
-
-    where :math:`y` is output, :math:`x` is input, :math:`b` is bias,
-    and :math:`f` is activation function.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        addto = addto_layer(input=[layer1, layer2],
-                            act=ReluActivation(),
-                            bias_attr=False)
-
-    This layer just simply adds all input layers together, then activates the
-    sum. All inputs should share the same dimension, which is also the dimension
-    of this layer's output.
-
-    There is no weight matrix for each input, because it just a simple add
-    operation. If you want a complicated operation before add, please use
-    mixed_layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layers. It could be a LayerOutput or list/tuple of
-                 LayerOutput.
-    :type input: LayerOutput | list | tuple
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    num_filters = None
-    if isinstance(input, LayerOutput):
-        input = [input]
-
-    assert isinstance(input, collections.Sequence)
-    ipts_for_layer = []
-    for each_input in input:
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name))
-        if each_input.num_filters is not None:
-            num_filters = each_input.num_filters
-
-    l = Layer(
-        name=name,
-        type=LayerType.ADDTO_LAYER,
-        inputs=ipts_for_layer,
-        bias=ParamAttr.to_bias(bias_attr),
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.ADDTO_LAYER,
-        parents=input,
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_act_default(act=IdentityActivation())
-@wrap_name_default("concat")
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
-    """
-    Concatenate all input vectors to one vector.
-    Inputs can be a list of LayerOutput or a list of projection.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        concat = concat_layer(input=[layer1, layer2])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layers or projections
-    :type input: list | tuple | collections.Sequence
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if isinstance(input, LayerOutput):
-        input = [input]
-    elif isinstance(input, Projection):
-        input = [input]
-    else:
-        assert isinstance(input, collections.Sequence)
-
-    def __is_type__(o, tp):
-        if not isinstance(o, collections.Sequence):
-            if o == tp:
-                return True
-            elif len(o.__bases__) == 0:
-                return False
-            else:
-                for bs in o.__bases__:
-                    if __is_type__(bs, tp):
-                        return True
-                return False
-        else:
-            tmp = map(lambda _x: __is_type__(_x, tp), o)
-            a = tmp[0]
-            for b in tmp[1:]:
-                assert a == b
-            return a
-
-    def __reduce_concat_type__(a, b):
-        assert __is_type__([a, b], Projection) or __is_type__([a, b],
-                                                              LayerOutput)
-        return a
-
-    is_concat_layer = __is_type__(
-        reduce(__reduce_concat_type__, map(type, input)), LayerOutput)
-
-    layer_type = (LayerType.CONCAT_LAYER
-                  if is_concat_layer else LayerType.CONCAT_PROJ_LAYER)
-
-    if layer_type == LayerType.CONCAT_LAYER:
-        assert not bias_attr
-
-    layer = Layer(
-        name=name,
-        type=layer_type,
-        inputs=[x.name for x in input] if is_concat_layer else input,
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    sz = layer.config.size
-
-    return LayerOutput(
-        name,
-        layer_type=layer_type,
-        parents=input if is_concat_layer else [x.origin for x in input],
-        activation=act,
-        size=sz)
-
-
-@wrap_name_default("seqconcat")
-@wrap_act_default(act=IdentityActivation())
-@wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
-                     bias_attr=None):
-    """
-    Concatenate sequence a and sequence b.
-
-    Inputs:
-      - a = [a1, a2, ..., am]
-      - b = [b1, b2, ..., bn]
-
-    Output: [a1, ..., am, b1, ..., bn]
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        concat = seq_concat_layer(a=layer1, b=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input sequence layer
-    :type a: LayerOutput
-    :param b: The second input sequence layer
-    :type b: LayerOutput
-    :param act: Activation type. IdentityActivation is the default activation.
-    :type act: BaseActivation
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    assert a.size == b.size
-    Layer(
-        name=name,
-        type=LayerType.SEQUENCE_CONCAT_LAYER,
-        inputs=[a.name, b.name],
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
-        parents=[a, b],
-        activation=act,
-        size=a.size)
-
-
-@wrap_name_default("memory", "memory_name")
-def memory(name,
-           size,
-           memory_name=None,
-           is_seq=False,
-           boot_layer=None,
-           boot_bias=None,
-           boot_bias_active_type=None,
-           boot_with_const_id=None):
-    """
-    The memory takes a layer's output at previous time step as its own output.
-
-    If boot_bias, the activation of the bias is the initial value of the memory.
-
-    If boot_with_const_id is set, then the memory's output at the first time step
-    is a IndexSlot, the Arguments.ids()[0] is this :code:`cost_id`.
-
-    If boot_layer is specified, the memory's output at the first time step will
-    be the boot_layer's output.
-
-    In other case, the default memory's output at the first time step is zero.
-
-    .. code-block:: python
-
-       mem = memory(size=256, name='state')
-       state = fc_layer(input=mem, size=256, name='state')
-
-    If you do not want to specify the name, you can also use set_input()
-    to specify the layer to be remembered as the following:
-
-    .. code-block:: python
-
-       mem = memory(size=256)
-       state = fc_layer(input=mem, size=256)
-       mem.set_input(mem)
-
-    :param name: The name of the layer which this memory remembers.
-                 If name is None, user should call set_input() to specify the
-                 name of the layer which this memory remembers.
-    :type name: basestring
-    :param size: The dimensionality of memory.
-    :type size: int
-    :param memory_name: The name of the memory. It is ignored when name is provided.
-    :type memory_name: basestring
-    :param is_seq: DEPRECATED. is sequence for boot_layer
-    :type is_seq: bool
-    :param boot_layer: This parameter specifies memory's output at the first time
-                       step and the output is boot_layer's output.
-    :type boot_layer: LayerOutput | None
-    :param boot_bias: The bias attribute of memory's output at the first time step.
-                      If the parameter is set to False or an object whose type is not
-                      ParameterAttribute, no bias is defined. If the parameter is set
-                      to True, the bias is initialized to zero.
-    :type boot_bias: ParameterAttribute | None
-    :param boot_bias_active_type: Activation type for memory's bias at the first time
-                                  step. LinearActivation is the default activation.
-    :type boot_bias_active_type: BaseActivation
-    :param boot_with_const_id: This parameter specifies memory's output at the first
-                               time step and the output is an index.
-    :type boot_with_const_id: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if boot_bias_active_type is None:
-        boot_bias_active_type = LinearActivation()
-
-    assert boot_bias is None or isinstance(boot_bias, ParameterAttribute)
-    if isinstance(boot_bias, ParameterAttribute):
-        boot_bias = ParamAttr.to_bias(boot_bias)
-
-    assert boot_layer is None or isinstance(boot_layer, LayerOutput)
-    if name is not None:
-        memory_name = None
-
-    memory_name = Memory(
-        name,
-        size,
-        boot_layer=boot_layer.name if boot_layer is not None else None,
-        boot_bias=boot_bias,
-        boot_bias_active_type=boot_bias_active_type.name,
-        boot_with_const_id=boot_with_const_id,
-        memory_name=memory_name)
-
-    lout = LayerOutput(
-        name=memory_name,
-        size=size,
-        layer_type=LayerType.MEMORY,
-        parents=[boot_layer] if boot_layer is not None else None)
-    return lout
-
-
-@wrap_bias_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(param_names=['state_act'], act=TanhActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('lstm_step')
-@layer_support()
-def lstm_step_layer(input,
-                    state,
-                    size=None,
-                    act=None,
-                    name=None,
-                    gate_act=None,
-                    state_act=None,
-                    bias_attr=None,
-                    layer_attr=None):
-    """
-    LSTM Step Layer. This function is used only in recurrent_group.
-    The lstm equations are shown as follows.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-
-    The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
-    :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vectors.
-
-    The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
-
-    ..  math::
-
-        i_t = \\sigma(input + W_{ci}c_{t-1} + b_i)
-
-        ...
-
-
-    This layer has two outputs. The default output is :math:`h_t`. The other
-    output is :math:`o_t`, whose name is 'state' and users can use
-    :code:`get_output_layer` to extract this output.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The dimension of this layer's output, which must be
-                 equal to the dimension of the state.
-    :type size: int
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param state: The state of the LSTM unit.
-    :type state: LayerOutput
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of the gate. SigmoidActivation is the
-                     default activation.
-    :type gate_act: BaseActivation
-    :param state_act: Activation type of the state. TanhActivation is the
-                      default activation.
-    :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert size is None or state.size == size
-    size = state.size
-    Layer(
-        name=name,
-        type=LayerType.LSTM_STEP_LAYER,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        active_state_type=state_act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        size=state.size,
-        inputs=[input.name, state.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.LSTM_STEP_LAYER,
-        parents=[input, state],
-        activation=act,
-        size=size,
-        outputs=['default', 'state'])
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('gru_step')
-@layer_support()
-def gru_step_layer(input,
-                   output_mem,
-                   size=None,
-                   act=None,
-                   name=None,
-                   gate_act=None,
-                   bias_attr=None,
-                   param_attr=None,
-                   layer_attr=None):
-    """
-
-    :param input: The input of this layer, whose dimension can be divided by 3.
-    :type input: LayerOutput
-    :param output_mem: A memory which memorizes the output of this layer at previous
-                       time step.
-    :type output_mem: LayerOutput
-    :param size: The dimension of this layer's output. If it is not set or set to None,
-                 it will be set to one-third of the dimension of the input automatically.
-    :type size: int
-    :param act: Activation type of this layer's output. TanhActivation
-                is the default activation.
-    :type act: BaseActivation
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
-                     the default activation.
-    :type gate_act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute, no bias
-                      is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.size % 3 == 0
-    if size is None:
-        size = input.size / 3
-    Layer(
-        name=name,
-        type=LayerType.GRU_STEP_LAYER,
-        # The parameter here is for transforming the output_mem. The input has
-        # already been transformed outside this module so it does not need
-        # parameter associated with it.
-        # The parameter here is instead grouped with input is due to
-        # backward model compatibility.
-        inputs=[Input(input.name, **param_attr.attr), output_mem.name],
-        bias=ParamAttr.to_bias(bias_attr),
-        size=size,
-        active_type=act.name,
-        active_gate_type=gate_act.name,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.GRU_STEP_LAYER,
-        parents=[input, output_mem],
-        size=size,
-        activation=act)
-
-
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
-@wrap_act_default(act=TanhActivation())
-@wrap_name_default('gru_step_naive')
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def gru_step_naive_layer(input,
-                         output_mem,
-                         size=None,
-                         name=None,
-                         act=None,
-                         gate_act=None,
-                         bias_attr=None,
-                         param_attr=None,
-                         layer_attr=None):
-    """
-    GRU Step Layer, which is realized using PaddlePaddle API. It supports ERROR_CLIPPING
-    and DROPOUT.
-
-    :param input: The input of this layer, whose dimensionality can be divided by 3.
-    :param output_mem: A memory which memorizes the output of this layer at previous
-                       time step.
-    :type output_mem: LayerOutput
-    :param size: The dimension of this layer's output. If it is not set or set to None,
-                 it will be set to one-third of the dimension of the input automatically.
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param act: Activation type of this layer's output. TanhActivation
-                is the default activation.
-    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. SigmoidActivation
-                     is the default activation.
-    :type gate_act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute, no bias
-                      is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if input.size % 3 != 0:
-        raise ValueError("GruStep input size must be divided by 3")
-    if size is None:
-        size = input.size / 3
-
-    if bias_attr and bias_attr.attr.get("parameter_name", None) is not None:
-        raise ValueError("You should not specify the field `name` in bias_attr."
-                         " Otherwise, the three biases, which correponding to "
-                         " the two gates and the mixed layer for computing Wx+b"
-                         ", will share the same parameter matrix unexpectedly.")
-
-    def __gate__(gate_name, offset):
-        with mixed_layer(
-                name=name + "_" + gate_name,
-                size=size,
-                layer_attr=layer_attr,
-                bias_attr=bias_attr,
-                act=gate_act) as gate:
-            gate += identity_projection(input=input, offset=offset)
-            gate += full_matrix_projection(
-                input=output_mem, param_attr=param_attr)
-        return gate
-
-    update_gate = __gate__("update", 0)
-    reset_gate = __gate__("reset", size)
-
-    with mixed_layer(
-            name=name + "_reset_output", bias_attr=False) as reset_output:
-        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
-
-    with mixed_layer(
-            name=name + "_output_candidate",
-            size=size,
-            layer_attr=layer_attr,
-            bias_attr=bias_attr,
-            act=act) as output_candidate:
-        output_candidate += identity_projection(input=input, offset=2 * size)
-        output_candidate += full_matrix_projection(
-            input=reset_output, param_attr=param_attr)
-
-    with mixed_layer(name=name) as output:
-        output += identity_projection(output_mem)
-        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
-        output += dotmul_operator(a=output_candidate, b=update_gate)
-
-    return output
-
-
-@wrap_name_default()
-@layer_support()
-def get_output_layer(input, arg_name, name=None, layer_attr=None):
-    """
-    Get layer's output by name. In PaddlePaddle, a layer might return multiple
-    values, but returns one layer's output. If the user wants to use another
-    output besides the default one, please use get_output_layer first to get
-    the output from input.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input layer. And this layer should contain
-                   multiple outputs.
-    :type input: LayerOutput
-    :param arg_name: The name of the output to be extracted from the input layer.
-    :type arg_name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    # GetOutputLayer
-    assert arg_name in input.outputs, 'Get Output From an not existed input.' \
-                                      ' The get output name is %s, which not' \
-                                      ' in %s' % (
-                                          arg_name, ",".join(input.outputs))
-    Layer(
-        name=name,
-        type=LayerType.GET_OUTPUT_LAYER,
-        inputs=[Input(
-            input.name, input_layer_argument=arg_name)],
-        size=input.size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.GET_OUTPUT_LAYER,
-        parents=[input],
-        size=input.size)
-
-
-@wrap_name_default()
-@wrap_act_default()
-@wrap_bias_attr_default()
-@wrap_param_attr_default()
-@layer_support()
-def recurrent_layer(input,
-                    act=None,
-                    bias_attr=None,
-                    param_attr=None,
-                    name=None,
-                    reverse=False,
-                    layer_attr=None):
-    """
-    Simple recurrent unit layer. It is just a fully connect layer through both
-    time and neural network.
-
-    For each sequence [start, end] it performs the following computation\:
-
-    ..  math::
-
-        out_{i} = act(in_{i})     \\      \\      \\text{for} \\ i = start \\\\
-        out_{i} = act(in_{i} + out_{i-1} * W) \\ \\ \\text{for} \\ start < i <= end
-
-    If reversed is true, the order is reversed\:
-
-    ..  math::
-
-        out_{i} = act(in_{i})           \\    \\   \\text{for} \\ i = end  \\\\
-        out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end
-
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If the parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.RECURRENT_LAYER,
-        inputs=Input(input.name, **param_attr.attr),
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        reversed=reverse,
-        **ExtraAttr.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.RECURRENT_LAYER,
-        parents=[input],
-        size=input.size,
-        activation=act,
-        reverse=reverse)
-
-
-class StaticInput(object):
-    """
-    StaticInput is only used in recurrent_group which defines a read-only memory
-    and can be a sequence or non-sequence.
-    :param size: DEPRECATED
-    :param is_seq: DEPRECATED
-    """
-
-    def __init__(self, input, is_seq=False, size=None):
-        assert isinstance(input, LayerOutput)
-        self.input = input
-        assert input.size is not None
-        if size is not None:
-            assert input.size == size
-
-
-def SubsequenceInput(input):
-    """
-    DEPRECATED.
-    Input sequence has sub-sequence, used in recurrent_group.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       input = SubsequenceInput(layer)
-    """
-    return input
-
-
-@wrap_name_default("recurrent_group")
-def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
-    """
-    Recurrent layer group is an extremely flexible recurrent unit in
-    PaddlePaddle. As long as the user defines the calculation done within a
-    time step, PaddlePaddle will iterate such a recurrent calculation over
-    sequence input. This is useful for attention-based models, or Neural
-    Turning Machine like models.
-
-    The basic usage (time steps) is:
-
-    .. code-block:: python
-
-       def step(input):
-           output = fc_layer(input=layer,
-                             size=1024,
-                             act=LinearActivation(),
-                             bias_attr=False)
-           return output
-
-       group = recurrent_group(input=layer,
-                               step=step)
-
-    You can see following configs for further usages:
-
-    - time steps: lstmemory_group, paddle/legacy/gserver/tests/sequence_layer_group.conf, \
-                  demo/seqToseq/seqToseq_net.py
-    - sequence steps: paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
-
-    :param step: A step function which takes the input of recurrent_group as its own
-                 input and returns values as recurrent_group's output every time step.
-
-                 The recurrent group scatters a sequence into time steps. And
-                 for each time step, it will invoke step function, and return
-                 a time step result. Then gather outputs of each time step into
-                 layer group's output.
-
-    :type step: callable
-
-    :param name: The recurrent_group's name. It is optional.
-    :type name: basestring
-
-    :param input: Input links array.
-
-                  LayerOutput will be scattered into time steps.
-                  SubsequenceInput will be scattered into sequence steps.
-                  StaticInput will be imported to each time step, and doesn't change
-                  over time. It's a mechanism to access layer outside step function.
-
-    :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
-
-    :param reverse: If reverse is set to True, the recurrent unit will process the
-                    input sequence in a reverse order.
-    :type reverse: bool
-
-    :param targetInlink: DEPRECATED.
-                         The input layer which share info with layer group's output
-
-                         Param input specifies multiple input layers. For
-                         SubsequenceInput inputs, config should assign one input
-                         layer that share info(the number of sentences and the number
-                         of words in each sentence) with all layer group's outputs.
-                         targetInlink should be one of the layer group's input.
-
-    :type targetInlink: LayerOutput | SubsequenceInput
-
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    model_type('recurrent_nn')
-
-    if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
-        input = [input]
-    assert isinstance(input, collections.Sequence)
-
-    def is_in_links(x):
-        return isinstance(x, LayerOutput)
-
-    in_links = filter(is_in_links, input)
-
-    RecurrentLayerGroupWithoutOutLinksBegin(
-        name=name,
-        in_links=map(lambda x: x.name, in_links),
-        seq_reversed=reverse)
-    in_args = []
-    for each_input in input:
-        if isinstance(each_input, StaticInput):  # StaticInput
-            mem_name = "__%s_memory__" % each_input.input.name
-            mem = memory(
-                name=None,
-                size=each_input.input.size,
-                boot_layer=each_input.input)
-            mem.set_input(mem)
-            in_args.append(mem)
-        else:
-            in_args.append(each_input)
-
-    layer_outs = step(*in_args)
-
-    if isinstance(layer_outs, LayerOutput):
-        layer_outs = [layer_outs]
-
-    for layer_out in layer_outs:
-        assert isinstance(
-            layer_out, LayerOutput
-        ), "Type of step function's return value must be LayerOutput."
-        layer_out.reverse = reverse
-        RecurrentLayerGroupSetOutLink(layer_out.name)
-
-    RecurrentLayerGroupEnd(name=name)
-
-    for layer_out in layer_outs:
-        # The previous full_name is the name inside the recurrent group.
-        # We need a full_name outside the recurrent group.
-        layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
-
-    if len(layer_outs) == 1:
-        return layer_outs[0]
-    else:
-        return layer_outs
-
-
-class BaseGeneratedInput(object):
-    def __init__(self):
-        self.bos_id = None
-        self.eos_id = None
-
-    def before_real_step(self):
-        raise NotImplementedError()
-
-    def after_real_step(self, *args):
-        raise NotImplementedError()
-
-
-class GeneratedInput(BaseGeneratedInput):
-    def after_real_step(self, input):
-        if isinstance(input, LayerOutput):
-            input = [input]
-        elif isinstance(input, collections.Sequence):
-            input = list(input)
-            if len(input) > 1:
-                logger.info(
-                    ("More than one layers inside the recurrent_group "
-                     "are returned as outputs of the entire recurrent_group "
-                     "PLEASE garantee the first output is probability of "
-                     "the predicted next word."))
-
-        return [maxid_layer(
-            input=input[0], name='__beam_search_predict__')] + (
-                input[1:] if len(input) > 1 else [])
-
-    def before_real_step(self):
-        predict_id = memory(
-            name='__beam_search_predict__',
-            size=self.size,
-            boot_with_const_id=self.bos_id)
-
-        trg_emb = embedding_layer(
-            input=predict_id,
-            size=self.embedding_size,
-            param_attr=ParamAttr(name=self.embedding_name))
-        return trg_emb
-
-    def __init__(self, size, embedding_name, embedding_size):
-        super(GeneratedInput, self).__init__()
-        self.size = size
-        self.embedding_name = embedding_name
-        self.embedding_size = embedding_size
-
-
-@wrap_name_default()
-def maxid_layer(input, name=None, layer_attr=None):
-    """
-    A layer for finding the id which has the maximal value for each sample.
-    The result is stored in output.ids.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       maxid = maxid_layer(input=layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput)
-    l = Layer(
-        name=name,
-        type='maxid',
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.MAXID_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def dot_prod_layer(input1, input2, name=None, layer_attr=None):
-    """
-    A layer for computing the dot product of two vectors.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input1: The first input layer.
-    :type input1: LayerOutput
-    :param input2: The second input layer.
-    :type input2: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input1, LayerOutput)
-    assert isinstance(input2, LayerOutput)
-    assert input1.size == input2.size, ("Two inputs should have the same size.")
-
-    l = Layer(
-        name=name,
-        type=LayerType.DOT_PROD_LAYER,
-        inputs=[input1.name, input2.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.DOT_PROD_LAYER,
-        parents=[input1, input2],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def out_prod_layer(input1, input2, name=None, layer_attr=None):
-    """
-    A layer for computing the outer product of two vectors
-    The result is a matrix of size(input1) x size(input2)
-
-    The example usage is:
-
-    .. code-block:: python
-
-       out_prod = out_prod_layer(input1=vec1, input2=vec2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input1: The first input layer.
-    :type input: LayerOutput
-    :param input2: The second input layer.
-    :type input2: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input1, LayerOutput)
-    assert isinstance(input2, LayerOutput)
-    l = Layer(
-        name=name,
-        type=LayerType.OUT_PROD_LAYER,
-        inputs=[input1.name, input2.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.OUT_PROD_LAYER,
-        parents=[input1, input2],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def eos_layer(input, eos_id, name=None, layer_attr=None):
-    """
-    A layer for checking EOS for each sample:
-    - output_id = (input_id == conf.eos_id)
-
-    The result is stored in output\_.ids.
-    It is used by recurrent layer group.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       eos = eos_layer(input=layer, eos_id=id)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param eos_id: End id of sequence
-    :type eos_id: int
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    l = Layer(
-        name=name,
-        type=LayerType.EOSID_LAYER,
-        eos_id=eos_id,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.EOSID_LAYER,
-        parents=[input],
-        size=l.config.size)
-
-
-@wrap_name_default()
-def beam_search(step,
-                input,
-                bos_id,
-                eos_id,
-                beam_size,
-                max_length=500,
-                name=None,
-                num_results_per_sample=None):
-    """
-    Beam search is a heuristic search algorithm used in sequence generation.
-    It explores a graph by expanding the most promising nodes in a limited set
-    to maintain tractability.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        def rnn_step(input):
-            last_time_step_output = memory(name='rnn', size=512)
-            with mixed_layer(size=512, name='rnn') as simple_rnn:
-                simple_rnn += full_matrix_projection(input)
-                simple_rnn += last_time_step_output
-            return simple_rnn
-
-        generated_word_embedding = GeneratedInput(
-                               size=target_dictionary_dim,
-                               embedding_name="target_language_embedding",
-                               embedding_size=word_vector_dim)
-
-        beam_gen = beam_search(name="decoder",
-                               step=rnn_step,
-                               input=[StaticInput(encoder_last),
-                                      generated_word_embedding],
-                               bos_id=0,
-                               eos_id=1,
-                               beam_size=5)
-
-    Please see the following demo for more details:
-
-    - machine translation : demo/seqToseq/translation/gen.conf \
-                            demo/seqToseq/seqToseq_net.py
-
-    :param name: The name of the recurrent unit that is responsible for
-                 generating sequences. It is optional.
-    :type name: basestring
-    :param step: A callable function that defines the calculation in a time
-                 step, and it is applied to sequences with arbitrary length by
-                 sharing a same set of weights.
-
-                 You can refer to the first parameter of recurrent_group, or
-                 demo/seqToseq/seqToseq_net.py for more details.
-    :type step: callable
-    :param input: Input data for the recurrent unit, which should include the
-                  previously generated words as a GeneratedInput object.
-                  In beam_search, none of the input's type should be LayerOutput.
-    :type input: list
-    :param bos_id: Index of the start symbol in the dictionary. The start symbol
-                   is a special token for NLP task, which indicates the
-                   beginning of a sequence. In the generation task, the start
-                   symbol is essential, since it is used to initialize the RNN
-                   internal state.
-    :type bos_id: int
-    :param eos_id: Index of the end symbol in the dictionary. The end symbol is
-                   a special token for NLP task, which indicates the end of a
-                   sequence. The generation process will stop once the end
-                   symbol is generated, or a pre-defined max iteration number
-                   is exceeded.
-    :type eos_id: int
-    :param max_length: Max generated sequence length.
-    :type max_length: int
-    :param beam_size: Beam search for sequence generation is an iterative search
-                      algorithm. To maintain tractability, every iteration only
-                      only stores a predetermined number, called the beam_size,
-                      of the most promising next words. The greater the beam
-                      size, the fewer candidate words are pruned.
-    :type beam_size: int
-    :param num_results_per_sample: Number of the generated results per input
-                                  sequence. This number must always be less than
-                                  beam size.
-    :type num_results_per_sample: int
-    :return: The generated word index.
-    :rtype: LayerOutput
-    """
-
-    if num_results_per_sample is None:
-        num_results_per_sample = beam_size
-    if num_results_per_sample > beam_size:
-        logger.warning("num_results_per_sample should be less than beam_size")
-
-    if isinstance(input, StaticInput) or isinstance(input, BaseGeneratedInput):
-        input = [input]
-
-    generated_input_index = -1
-
-    real_input = []
-    for i, each_input in enumerate(input):
-        assert not isinstance(each_input, LayerOutput), (
-            "in beam_search, "
-            "none of the input should has a type of LayerOutput.")
-        if isinstance(each_input, BaseGeneratedInput):
-            assert generated_input_index == -1, ("recurrent_group accepts "
-                                                 "only one GeneratedInput.")
-            generated_input_index = i
-
-        else:
-            real_input.append(each_input)
-
-    assert generated_input_index != -1, "No GeneratedInput is given."
-
-    gipt = input[generated_input_index]
-
-    gipt.bos_id = bos_id
-    gipt.eos_id = eos_id
-
-    def __real_step__(*args):
-        eos_name = "__%s_eos_layer__" % name
-        RecurrentLayerGroupSetGenerator(
-            Generator(
-                eos_layer_name=eos_name,
-                max_num_frames=max_length,
-                beam_size=beam_size,
-                num_results_per_sample=num_results_per_sample))
-
-        args = list(args)
-        args.insert(generated_input_index, gipt.before_real_step())
-
-        predict = gipt.after_real_step(step(*args))
-
-        eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
-        return predict
-
-    return recurrent_group(
-        step=__real_step__, input=real_input, reverse=False, name=name)
-
-
-def __cost_input__(input, label, weight=None):
-    """
-    inputs and parents for cost layers.
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    if isinstance(label, LayerOutput):
-        label = [label]
-    ipts = [Input(ipt.name) for ipt in (input + label)]
-    parents = [ipt for ipt in (input + label)]
-    if weight is not None:
-        assert weight.size == 1
-        ipts.append(Input(weight.name))
-        parents.append(weight)
-    return ipts, parents
-
-
-@wrap_name_default()
-@layer_support()
-def square_error_cost(input,
-                      label,
-                      weight=None,
-                      name=None,
-                      coeff=1.0,
-                      layer_attr=None):
-    """
-    sum of square error cost:
-
-    ..  math::
-
-        cost = \\sum_{i=1}^N(t_i-y_i)^2
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    ipts, parents = __cost_input__(input, label, weight)
-
-    Layer(
-        inputs=ipts,
-        type="square_error",
-        name=name,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
-
-
-regression_cost = square_error_cost
-
-
-@wrap_name_default("cost")
-@layer_support()
-def classification_cost(input,
-                        label,
-                        weight=None,
-                        name=None,
-                        evaluator=classification_error_evaluator,
-                        layer_attr=None,
-                        coeff=1.):
-    """
-    classification cost Layer.
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param evaluator: Evaluator method. classification_error_evaluator is the default.
-    :type evaluator: Evaluator method
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert input.layer_type != LayerType.DATA
-    assert isinstance(input.activation, SoftmaxActivation)
-    assert label.layer_type == LayerType.DATA
-
-    ipts, parents = __cost_input__(input, label, weight)
-
-    Layer(
-        name=name,
-        type="multi-class-cross-entropy",
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    def __add_evaluator__(e):
-        assert callable(e)
-        assert hasattr(e, 'is_evaluator')
-        assert isinstance(e.is_evaluator, bool)
-        assert e.is_evaluator
-        assert hasattr(e, "for_classification")
-        assert isinstance(e.for_classification, bool)
-        assert e.for_classification
-
-        e(name=e.__name__, input=input, label=label, weight=weight)
-
-    if not isinstance(evaluator, collections.Sequence):
-        evaluator = [evaluator]
-
-    for each_evaluator in evaluator:
-        __add_evaluator__(each_evaluator)
-
-    return LayerOutput(name, LayerType.COST, parents=parents, size=1)
-
-
-def conv_operator(img,
-                  filter,
-                  filter_size,
-                  num_filters,
-                  num_channels=None,
-                  stride=1,
-                  padding=0,
-                  filter_size_y=None,
-                  stride_y=None,
-                  padding_y=None,
-                  trans=False):
-    """
-    Different from img_conv_layer, conv_op is an Operator, which can be used
-    in mixed_layer. And conv_op takes two inputs to perform convolution.
-    The first input is the image and the second is filter kernel. It only
-    supports GPU mode.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       op = conv_operator(img=input1,
-                          filter=input2,
-                          filter_size=3,
-                          num_filters=64,
-                          num_channels=64)
-
-    :param img: The input image.
-    :type img: LayerOutput
-    :param filter: The input filter.
-    :type filter: LayerOutput
-    :param filter_size: The dimension of the filter kernel on the x axis.
-    :type filter_size: int
-    :param filter_size_y: The dimension of the filter kernel on the y axis.
-                          If the parameter is not set or set to None, it will
-                          set to 'filter_size' automatically.
-    :type filter_size_y: int
-    :param num_filters: The number of the output channels.
-    :type num_filters: int
-    :param num_channels: The number of the input channels. If the parameter is not set
-                         or set to None, it will be automatically set to the channel
-                         number of the 'img'.
-    :type num_channels: int
-    :param stride: The stride on the x axis.
-    :type stride: int
-    :param stride_y: The stride on the y axis. If the parameter is not set or
-                     set to None, it will be set to 'stride' automatically.
-    :type stride_y: int
-    :param padding: The padding size on the x axis.
-    :type padding: int
-    :param padding_y: The padding size on the y axis. If the parameter is not set
-                      or set to None, it will be set to 'padding' automatically.
-    :type padding_y: int
-    :return: A ConvOperator Object.
-    :rtype: ConvOperator
-    """
-    if filter_size_y is None:
-        filter_size_y = filter_size
-    if stride_y is None:
-        stride_y = stride
-    if padding_y is None:
-        padding_y = padding
-
-    if num_channels is None:
-        num_channels = img.num_filters
-
-    assert isinstance(filter, LayerOutput)
-    assert filter.size is not None
-
-    opCls = ConvTransOperator if trans else ConvOperator
-
-    op = opCls(
-        input_layer_names=[img.name, filter.name],
-        num_filters=num_filters,
-        conv_conf=Conv(
-            filter_size=filter_size,
-            padding=padding,
-            stride=stride,
-            channels=num_channels,
-            filter_size_y=filter_size_y,
-            padding_y=padding_y,
-            stride_y=stride_y,
-            groups=1))
-
-    op.origin = [img, filter]
-    return op
-
-
-@wrap_param_attr_default()
-def conv_projection(input,
-                    filter_size,
-                    num_filters,
-                    num_channels=None,
-                    stride=1,
-                    padding=0,
-                    filter_size_y=None,
-                    stride_y=None,
-                    padding_y=None,
-                    groups=1,
-                    param_attr=None,
-                    trans=False):
-    """
-    Different from img_conv_layer and conv_op, conv_projection is a Projection,
-    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
-    convolution and only supports GPU mode.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       proj = conv_projection(input=input1,
-                              filter_size=3,
-                              num_filters=64,
-                              num_channels=64)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel. If the parameter is
-                        set to one integer, the two dimensions on x and y axises
-                        will be same when filter_size_y is not set. If it is set
-                        to a list, the first element indicates the dimension on
-                        the x axis, and the second is used to specify the dimension
-                        on the y axis when filter_size_y is not provided.
-    :type filter_size: int | tuple | list
-    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
-                          is not set, it will be set automatically according to filter_size.
-    :type filter_size_y: int
-    :param num_filters: The number of filters.
-    :type num_filters: int
-    :param num_channels: The number of the input channels.
-    :type num_channels: int
-    :param stride: The strides. If the parameter is set to one integer, the strides
-                   on x and y axises will be same when stride_y is not set. If it is
-                   set to a list, the first element indicates the stride on the x axis,
-                   and the second is used to specify the stride on the y axis when
-                   stride_y is not provided.
-    :type stride: int | tuple | list
-    :param stride_y: The stride on the y axis.
-    :type stride_y: int
-    :param padding: The padding sizes. If the parameter is set to one integer, the padding
-                    sizes on x and y axises will be same when padding_y is not set. If it
-                    is set to a list, the first element indicates the padding size on the
-                    x axis, and the second is used to specify the padding size on the y axis
-                    when padding_y is not provided.
-    :type padding: int | tuple | list
-    :param padding_y: The padding size on the y axis.
-    :type padding_y: int
-    :param groups: The group number.
-    :type groups: int
-    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param trans: Whether it is ConvTransProjection or ConvProjection
-    :type trans: bool
-    :return: A Projection Object.
-    :rtype: ConvTransProjection | ConvProjection
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if filter_size_y is None:
-        if isinstance(filter_size, collections.Sequence):
-            assert len(filter_size) == 2
-            filter_size, filter_size_y = filter_size
-        else:
-            filter_size_y = filter_size
-
-    if stride_y is None:
-        if isinstance(stride, collections.Sequence):
-            assert len(stride) == 2
-            stride, stride_y = stride
-        else:
-            stride_y = stride
-
-    if padding_y is None:
-        if isinstance(padding, collections.Sequence):
-            assert len(padding) == 2
-            padding, padding_y = padding
-        else:
-            padding_y = padding
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    projCls = ConvTransProjection if trans else ConvProjection
-
-    proj = projCls(
-        input_layer_name=input.name,
-        num_filters=num_filters,
-        conv_conf=Conv(
-            filter_size=filter_size,
-            padding=padding,
-            stride=stride,
-            channels=num_channels,
-            filter_size_y=filter_size_y,
-            padding_y=padding_y,
-            stride_y=stride_y,
-            groups=groups),
-        **param_attr.attr)
-
-    proj.origin = input
-    return proj
-
-
-@wrap_name_default("pad")
-@layer_support()
-def pad_layer(input,
-              pad_c=None,
-              pad_h=None,
-              pad_w=None,
-              name=None,
-              layer_attr=None):
-    """
-    This operation pads zeros to the input data according to pad_c,pad_h
-    and pad_w. pad_c, pad_h, pad_w specify the size in the corresponding
-    dimension. And the input data shape is NCHW.
-
-    For example, pad_c=[2,3] means padding 2 zeros before the input data
-    and 3 zeros after the input data in the channel dimension. pad_h means
-    padding zeros in the height dimension. pad_w means padding zeros in the
-    width dimension.
-
-    For example,
-
-    .. code-block:: python
-
-       input(2,2,2,3)  = [
-                           [ [[1,2,3], [3,4,5]],
-                             [[2,3,5], [1,6,7]] ],
-                           [ [[4,3,1], [1,8,7]],
-                             [[3,8,9], [2,3,5]] ]
-                         ]
-
-       pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
-
-       output(2,4,2,3) = [
-                           [ [[0,0,0], [0,0,0]],
-                             [[1,2,3], [3,4,5]],
-                             [[2,3,5], [1,6,7]],
-                             [[0,0,0], [0,0,0]] ],
-                           [ [[0,0,0], [0,0,0]],
-                             [[4,3,1], [1,8,7]],
-                             [[3,8,9], [2,3,5]],
-                             [[0,0,0], [0,0,0]] ]
-                         ]
-
-    The simply usage is:
-
-    .. code-block:: python
-
-       pad = pad_layer(input=ipt,
-                       pad_c=[4,4],
-                       pad_h=[0,0],
-                       pad_w=[2,2])
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param pad_c: The padding size in the channel dimension.
-    :type pad_c: list | None
-    :param pad_h: The padding size in the height dimension.
-    :type pad_h: list | None
-    :param pad_w: The padding size in the width dimension.
-    :type pad_w: list | None
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if pad_c is not None:
-        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
-    else:
-        pad_c = [0, 0]
-
-    if pad_h is not None:
-        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
-    else:
-        pad_h = [0, 0]
-
-    if pad_w is not None:
-        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
-    else:
-        pad_w = [0, 0]
-
-    assert input.num_filters is not None
-    in_ch = input.num_filters
-    out_ch = in_ch + pad_c[0] + pad_c[1]
-
-    l = Layer(
-        name=name,
-        type=LayerType.PAD_LAYER,
-        inputs=Input(
-            input.name,
-            pad=Pad(
-                channels=in_ch,
-                pad_c=pad_c,
-                pad_h=pad_h,
-                pad_w=pad_w, )),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        layer_type=LayerType.PAD_LAYER,
-        parents=[input],
-        num_filters=out_ch,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def conv_shift_layer(a, b, name=None, layer_attr=None):
-    """
-    This layer performs cyclic convolution on two inputs. For example:
-      - a[in]: contains M elements.
-      - b[in]: contains N elements (N should be odd).
-      - c[out]: contains M elements.
-
-    .. math::
-
-        c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
-
-    In this formula:
-     - a's index is computed modulo M. When it is negative, then get item from
-       the right side (which is the end of array) to the left.
-     - b's index is computed modulo N. When it is negative, then get item from
-       the right size (which is the end of array) to the left.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       conv_shift = conv_shift_layer(a=layer1, b=layer2)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    assert b.size is None or b.size % 2 == 1  # size of b must be odd.
-    Layer(
-        name=name,
-        type=LayerType.CONV_SHIFT_LAYER,
-        inputs=[a.name, b.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.CONV_SHIFT_LAYER, parents=[a, b], size=a.size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=LinearActivation())
-@layer_support(ERROR_CLIPPING, DROPOUT)
-def tensor_layer(a,
-                 b,
-                 size,
-                 act=None,
-                 name=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 layer_attr=None):
-    """
-    This layer performs tensor operation on two inputs.
-    For example:
-
-    .. math::
-       y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
-
-    In this formular:
-      - :math:`a`: the first input contains M elements.
-      - :math:`b`: the second input contains N elements.
-      - :math:`y_{i}`: the i-th element of y.
-      - :math:`W_{i}`: the i-th learned weight, shape if [M, N]
-      - :math:`b^\mathrm{T}`: the transpose of :math:`b_{2}`.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       tensor = tensor_layer(a=layer1, b=layer2, size=1000)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param a: The first input of this layer.
-    :type a: LayerOutput
-    :param b: The second input of this layer.
-    :type b: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param act: Activation type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
-    Layer(
-        name=name,
-        size=size,
-        type=LayerType.TENSOR_LAYER,
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr),
-        inputs=[Input(a.name, **param_attr.attr), Input(b.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.TENSOR_LAYER, parents=[a, b], activation=act, size=size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default()
-@layer_support(DROPOUT, ERROR_CLIPPING)
-def selective_fc_layer(input,
-                       size,
-                       select=None,
-                       act=None,
-                       name=None,
-                       pass_generation=False,
-                       has_selected_colums=True,
-                       mul_ratio=0.02,
-                       param_attr=None,
-                       bias_attr=None,
-                       layer_attr=None):
-    """
-    Selectived fully connected layer. Different from fc_layer, the output
-    of this layer can be sparse. It requires an additional input to indicate
-    several selected columns for output. If the selected columns is not
-    specified, selective_fc_layer acts exactly like fc_layer.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput | list | tuple
-    :param select: The layer to select columns to output. It should be a sparse
-                   binary matrix, and is treated as the mask of selective fc. If
-                   it is not set or set to None, selective_fc_layer acts exactly
-                   like fc_layer.
-    :type select: LayerOutput
-    :param size: The dimension of this layer, which should be equal to that of
-                 the layer 'select'.
-    :type size: int
-    :param act: Activation type. TanhActivation is the default activation.
-    :type act: BaseActivation
-    :param pass_generation: The flag which indicates whether it is during generation.
-    :type pass_generation: bool
-    :param has_selected_colums: The flag which indicates whether the parameter 'select'
-                                has been set. True is the default.
-    :type has_selected_colums: bool
-    :param mul_ratio: A ratio helps to judge how sparse the output is and determine
-                      the computation method for speed consideration.
-    :type mul_ratio: float
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            if "parameter_name" in param_attr.attr and len(input) > 1:
-                logger.fatal(
-                    "When the name field of param_attr is manually specified "
-                    "and the input is a list, the param_attr should also be a "
-                    "list with each item being the param_attr for each input "
-                    "item. If only one named param_attr is provided, all the "
-                    "input items would share this parameter.")
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-    assert isinstance(select, LayerOutput)
-    if select.size is not None:
-        assert select.size == size
-    Layer(
-        inputs=[
-            Input(ipt.name, **attr.attr) for ipt, attr in zip(input, param_attr)
-        ] + [select.name],
-        name=name,
-        type=LayerType.SEL_FC_LAYER,
-        size=size,
-        bias=ParameterAttribute.to_bias(bias_attr),
-        active_type=act.name,
-        selective_fc_pass_generation=pass_generation,
-        has_selected_colums=has_selected_colums,
-        selective_fc_full_mul_ratio=mul_ratio,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.SEL_FC_LAYER,
-        list(input) + [select],
-        activation=act,
-        size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def sampling_id_layer(input, name=None, layer_attr=None):
-    """
-    A layer for sampling id from a multinomial distribution from the input layer.
-    Sampling one id for one sample.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       samping_id = sampling_id_layer(input=input)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    l = Layer(
-        name=name,
-        type=LayerType.SAMPLING_ID_LAYER,
-        inputs=[Input(input.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SAMPLING_ID_LAYER, input, size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def slope_intercept_layer(input,
-                          name=None,
-                          slope=1.0,
-                          intercept=0.0,
-                          layer_attr=None):
-    """
-    This layer for applying a slope and an intercept to the input.
-
-    ..  math::
-        y = slope * x + intercept
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       scale = slope_intercept_layer(input=input, slope=-1.0, intercept=1.0)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param slope: The scale factor.
-    :type slope: float
-    :param intercept: The offset.
-    :type intercept: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SLOPE_INTERCEPT_LAYER,
-        slope=slope,
-        intercept=intercept,
-        inputs=[Input(input.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SLOPE_INTERCEPT_LAYER, input, size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
-    """
-    A layer for weighted sum of vectors takes two inputs.
-      - Input: size of weights is M
-               size of vectors is M*N
-      - Output: a vector of size=N
-
-    .. math::
-
-       z(i) = \sum_{j=0}^{M-1} x(j) y(i+Nj)
-
-    where :math:`0 \le i \le N-1`
-
-    Or in the matrix notation:
-
-    .. math::
-
-       z = x^\mathrm{T} Y
-
-    In this formular:
-      - :math:`x`: weights
-      - :math:`y`: vectors.
-      - :math:`z`: the output.
-
-    Note that the above computation is for one sample. Multiple samples are
-    processed in one batch.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       linear_comb = linear_comb_layer(weights=weight, vectors=vectors,
-                                       size=elem_dim)
-
-    :param weights: The weight layer.
-    :type weights: LayerOutput
-    :param vectors: The vector layer.
-    :type vectors: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(weights, LayerOutput) and isinstance(vectors, LayerOutput)
-    if vectors.size is not None and weights.size is not None:
-        assert vectors.size % weights.size == 0
-        if size is None:
-            size = vectors.size / weights.size
-        else:
-            assert size == vectors.size / weights.size
-    Layer(
-        name=name,
-        type=LayerType.LINEAR_COMBINATION_LAYER,
-        size=size,
-        inputs=[Input(weights.name), Input(vectors.name)],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.LINEAR_COMBINATION_LAYER, [weights, vectors], size=size)
-
-
-convex_comb_layer = linear_comb_layer
-
-
-@wrap_name_default()
-@layer_support()
-def block_expand_layer(input,
-                       block_x=0,
-                       block_y=0,
-                       stride_x=0,
-                       stride_y=0,
-                       padding_x=0,
-                       padding_y=0,
-                       num_channels=None,
-                       name=None,
-                       layer_attr=None):
-    """
-    Expand feature map to minibatch matrix.
-       - matrix width is: block_y * block_x * num_channels
-       - matirx height is: outputH * outputW
-
-    .. math::
-
-       outputH = 1 + (2 * padding_y + imgSizeH - block_y + stride_y - 1) / stride_y
-
-       outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x
-
-    The expanding method is the same with ExpandConvLayer, but saved the transposed
-    value. After expanding, output.sequenceStartPositions will store timeline.
-    The number of time steps is outputH * outputW and the dimension of each
-    time step is block_y * block_x * num_channels. This layer can be used after
-    convolutional neural network, and before recurrent neural network.
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       block_expand = block_expand_layer(input=layer,
-                                         num_channels=128,
-                                         stride_x=1,
-                                         stride_y=1,
-                                         block_x=1,
-                                         block_x=3)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param block_x: The width of sub block.
-    :type block_x: int
-    :param block_y: The width of sub block.
-    :type block_y: int
-    :param stride_x: The stride size in horizontal direction.
-    :type stride_x: int
-    :param stride_y: The stride size in vertical direction.
-    :type stride_y: int
-    :param padding_x: The padding size in horizontal direction.
-    :type padding_x: int
-    :param padding_y: The padding size in vertical direction.
-    :type padding_y: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring.
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            block_expand=BlockExpand(
-                channels=num_channels,
-                block_x=block_x,
-                block_y=block_y,
-                stride_x=stride_x,
-                stride_y=stride_y,
-                padding_x=padding_x,
-                padding_y=padding_y)),
-        type=LayerType.BLOCK_EXPAND,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.BLOCK_EXPAND, parents=[input], size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
-    """
-    A layer to do max out on convolutional layer output.
-      - Input: the output of a convolutional layer.
-      - Output: feature map size same as the input's, and its channel number is
-        (input channel) / groups.
-
-    So groups should be larger than 1, and the num of channels should be able
-    to be devided by groups.
-
-    Reference:
-        `Maxout Networks
-        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
-        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
-        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
-
-
-    .. math::
-
-       & out = \max_k (in[n, k, o_c , s])
-
-       & out_{i * s + j} = \max_k in_{  k * o_{c} * s + i * s + j}
-
-       & s = \\frac{input.size}{ num\_channels}
-
-       & o_{c} = \\frac{num\_channels}{groups}
-
-       & 0 \le i < o_{c}
-
-       & 0 \le j < s
-
-       & 0 \le k < groups
-
-
-    The simple usage is:
-
-    .. code-block:: python
-
-       maxout = maxout_layer(input,
-                             num_channels=128,
-                             groups=4)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param groups: The group number of input layer.
-    :type groups: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input.activation, LinearActivation)
-    assert groups > 1
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-    assert num_channels % groups == 0
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name, maxout=MaxOut(
-                channels=num_channels, groups=groups)),
-        type=LayerType.MAXOUT,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.MAXOUT, parents=[input], size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def ctc_layer(input,
-              label,
-              size=None,
-              name=None,
-              norm_by_times=False,
-              layer_attr=None):
-    """
-    Connectionist Temporal Classification (CTC) is designed for temporal
-    classication task. e.g. sequence labeling problems where the
-    alignment between the inputs and the target labels is unknown.
-
-    Reference:
-        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-        with Recurrent Neural Networks
-        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
-
-    Note:
-        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
-        as the size of the input, where num_classes is the category number.
-        And the 'blank' is the last category index. So the size of 'input' layer (e.g.
-        fc_layer with softmax activation) should be (num_classes + 1). The size of
-        ctc_layer should also be (num_classes + 1).
-
-    The example usage is:
-
-    .. code-block:: python
-
-      ctc = ctc_layer(input=input,
-                      label=label,
-                      size=9055,
-                      norm_by_times=True)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The dimension of this layer, which must be equal to (category number + 1).
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param norm_by_times: Whether to do normalization by times. False is the default.
-    :type norm_by_times: bool
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    if label.size is not None:
-        if size is not None:
-            assert size == label.size + 1
-        else:
-            size = label.size + 1
-    Layer(
-        name=name,
-        type=LayerType.CTC_LAYER,
-        size=size,
-        norm_by_times=norm_by_times,
-        inputs=[input.name, label.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
-
-
-@wrap_name_default()
-@layer_support()
-def warp_ctc_layer(input,
-                   label,
-                   size=None,
-                   name=None,
-                   blank=0,
-                   norm_by_times=False,
-                   layer_attr=None):
-    """
-    A layer intergrating the open-source `warp-ctc
-    <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
-    `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
-    <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc repository
-    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
-    the official one, is maintained to enable more compiling options. During the
-    building process, PaddlePaddle will clone the source codes, build and
-    install it to :code:`third_party/install/warpctc` directory.
-
-    Reference:
-        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
-        with Recurrent Neural Networks
-        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
-
-    Note:
-        - Let num_classes represents the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the size of
-          warp_ctc layer.
-        - You can set 'blank' to any value ranged in [0, num_classes], which
-          should be consistent with those used in your labels.
-        - As a native 'softmax' activation is interated to the warp-ctc library,
-          'linear' activation is expected to be used instead in the 'input' layer.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      ctc = warp_ctc_layer(input=input,
-                           label=label,
-                           size=1001,
-                           blank=1000,
-                           norm_by_times=False)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The dimension of this layer, which must be equal to (category number + 1).
-    :type size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param blank: The 'blank' label used in ctc.
-    :type blank: int
-    :param norm_by_times: Whether to do normalization by times. False is the default.
-    :type norm_by_times: bool
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    if label.size is not None:
-        if size is not None:
-            assert size == label.size + 1
-        else:
-            size = label.size + 1
-    Layer(
-        name=name,
-        type=LayerType.WARP_CTC_LAYER,
-        size=size,
-        blank=blank,
-        norm_by_times=norm_by_times,
-        inputs=[input.name, label.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@layer_support()
-def crf_layer(input,
-              label,
-              size=None,
-              weight=None,
-              param_attr=None,
-              name=None,
-              coeff=1.0,
-              layer_attr=None):
-    """
-    A layer for calculating the cost of sequential conditional random
-    field model.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      crf = crf_layer(input=input,
-                      label=label,
-                      size=label_dim)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type label: LayerOutput
-    :param size: The category number.
-    :type size: int
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    assert weight is None or isinstance(weight, LayerOutput)
-    if input.size is not None and label.size is not None:
-        assert input.size == label.size
-        if size is None:
-            size = input.size
-        else:
-            assert size == input.size
-
-    ipts = [Input(input.name, **param_attr.attr), Input(label.name)]
-    if weight is not None:
-        ipts.append(Input(weight.name))
-
-    Layer(
-        name=name,
-        type=LayerType.CRF_LAYER,
-        size=size,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    parents = [input, label]
-    if weight is not None:
-        parents.append(weight)
-    # The size for LayerOutput means the dimension of the output.
-    # It's different from the meaning of crf layer, which is the number of
-    # classes.
-    return LayerOutput(name, LayerType.CRF_LAYER, parents, size=1)
-
-
-@wrap_name_default()
-@wrap_param_attr_default()
-@layer_support()
-def crf_decoding_layer(input,
-                       size,
-                       label=None,
-                       param_attr=None,
-                       name=None,
-                       layer_attr=None):
-    """
-    A layer for calculating the decoding sequence of sequential conditional
-    random field model. The decoding sequence is stored in output.ids.
-    If the input 'label' is provided, it is treated as the ground-truth label, and
-    this layer will also calculate error. output.value[i] is 1 for an incorrect
-    decoding and 0 for the correct.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      crf_decoding = crf_decoding_layer(input=input,
-                                        size=label_dim)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer.
-    :type size: int
-    :param label: The input label.
-    :type label: LayerOutput | None
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput)
-    assert label is None or isinstance(label, LayerOutput)
-
-    ipts = [Input(input.name, **param_attr.attr)]
-    if label is not None:
-        ipts.append(Input(label.name))
-
-    Layer(
-        name=name,
-        type=LayerType.CRF_DECODING_LAYER,
-        size=size,
-        inputs=ipts,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    parents = [input]
-    if label is not None:
-        parents.append(label)
-    # The size for LayerOutput means the dimension of the output.
-    # It's different from the meaning of crf layer, which is the number of
-    # classes.
-    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
-
-
-"""
-Following are cost Layers.
-"""
-
-
-@wrap_bias_attr_default(has_bias=True)
-@wrap_param_attr_default()
-@wrap_name_default()
-@layer_support()
-def nce_layer(input,
-              label,
-              num_classes=None,
-              param_attr=None,
-              weight=None,
-              num_neg_samples=10,
-              neg_distribution=None,
-              name=None,
-              bias_attr=None,
-              layer_attr=None):
-    """
-    Noise-contrastive estimation.
-
-    Reference:
-        `A fast and simple algorithm for training neural probabilistic language
-        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = nce_layer(input=[layer1, layer2], label=layer2,
-                        param_attr=[attr1, attr2], weight=layer3,
-                        num_classes=3, neg_distribution=[0.1,0.3,0.6])
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The first input of this layer.
-    :type input: LayerOutput | list | tuple | collections.Sequence
-    :param label: The input label.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param num_classes: The number of classes.
-    :type num_classes: int
-    :param act: Activation type. SigmoidActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param num_neg_samples: The number of sampled negative labels. 10 is the
-                            default value.
-    :type num_neg_samples: int
-    :param neg_distribution: The discrete noisy distribution over the output
-                             space from which num_neg_samples negative labels
-                             are sampled. If this parameter is not set, a
-                             uniform distribution will be used. A user-defined
-                             distribution is a list whose length must be equal
-                             to the num_classes. Each member of the list defines
-                             the probability of a class given input x.
-    :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The parameter attribute for bias. If this parameter is set to
-                      False or an object whose type is not ParameterAttribute,
-                      no bias is defined. If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-        assert not isinstance(param_attr, collections.Sequence)
-        param_attr = [param_attr]
-    else:
-        if isinstance(param_attr, collections.Sequence):
-            assert len(input) == len(param_attr)
-        else:
-            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
-
-    assert isinstance(input, collections.Sequence)
-
-    assert isinstance(label, LayerOutput)
-    assert label.layer_type == LayerType.DATA
-    if num_classes is None:
-        num_classes = label.size
-    if neg_distribution is not None:
-        assert isinstance(neg_distribution, collections.Sequence)
-        assert len(neg_distribution) == num_classes
-        assert abs(sum(neg_distribution) - 1.0) < 1e-5
-
-    ipts_for_layer = []
-    parents = []
-    for each_input, attr in zip(input, param_attr):
-        assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(Input(each_input.name, **attr.attr))
-        parents.append(each_input)
-    ipts_for_layer.append(label.name)
-    parents.append(label)
-
-    if weight is not None:
-        assert isinstance(weight, LayerOutput)
-        assert weight.layer_type == LayerType.DATA
-        ipts_for_layer.append(weight.name)
-        parents.append(weight)
-
-    l = Layer(
-        name=name,
-        type=LayerType.NCE_LAYER,
-        num_classes=num_classes,
-        neg_sampling_dist=neg_distribution,
-        active_type=SigmoidActivation().name,
-        num_neg_samples=num_neg_samples,
-        inputs=ipts_for_layer,
-        bias=ParamAttr.to_bias(bias_attr),
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.NCE_LAYER,
-        parents=parents,
-        size=l.config.size,
-        activation=SigmoidActivation())
-
-
-@wrap_name_default()
-@layer_support()
-def rank_cost(left,
-              right,
-              label,
-              weight=None,
-              name=None,
-              coeff=1.0,
-              layer_attr=None):
-    """
-    A cost Layer for learning to rank using gradient descent.
-
-    Reference:
-        `Learning to Rank using Gradient Descent
-        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
-
-    .. math::
-
-       C_{i,j} & = -\\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}})
-
-       o_{i,j} & =  o_i - o_j
-
-       \\tilde{P_{i,j}} & = \\{0, 0.5, 1\\} \ or \ \\{0, 1\\}
-
-    In this formula:
-      - :math:`C_{i,j}` is the cross entropy cost.
-      - :math:`\\tilde{P_{i,j}}` is the label. 1 means positive order
-        and 0 means reverse order.
-      - :math:`o_i` and :math:`o_j`: the left output and right output.
-        Their dimension is one.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      cost = rank_cost(left=out_left,
-                       right=out_right,
-                       label=label)
-
-    :param left: The first input, the size of this layer is 1.
-    :type left: LayerOutput
-    :param right: The right input, the size of this layer is 1.
-    :type right: LayerOutput
-    :param label: Label is 1 or 0, means positive order and reverse order.
-    :type label: LayerOutput
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert left.size == 1
-    assert right.size == 1
-    assert label.size == 1
-
-    ipts = [left.name, right.name, label.name]
-    parents = [left, right, label]
-    if weight is not None:
-        ipts.append(weight.name)
-        parents.append(weight)
-
-    Layer(
-        name=name,
-        type=LayerType.RANK_COST,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(name, LayerType.RANK_COST, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def lambda_cost(input,
-                score,
-                name,
-                NDCG_num=5,
-                max_sort_size=-1,
-                layer_attr=None):
-    """
-    lambdaCost for lambdaRank LTR approach.
-
-    The example usage is:
-
-    .. code-block:: python
-
-      cost = lambda_cost(input=input,
-                         score=score,
-                         NDCG_num=8,
-                         max_sort_size=-1)
-
-    :param input: The first input of this layer, which is often a document
-                  samples list of the same query and whose type must be sequence.
-    :type input: LayerOutput
-    :param score: The scores of the samples.
-    :type input: LayerOutput
-    :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
-                     e.g., 5 for NDCG@5. It must be less than or equal to the
-                     minimum size of the list.
-    :type NDCG_num: int
-    :param max_sort_size: The size of partial sorting in calculating gradient. If
-                          max_sort_size is equal to -1 or greater than the number
-                          of the samples in the list, then the algorithm will sort
-                          the entire list to compute the gradient. In other cases,
-                          max_sort_size must be greater than or equal to NDCG_num.
-    :type max_sort_size: int
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput) and isinstance(score, LayerOutput)
-    if score.size is not None:
-        assert score.size == 1
-    Layer(
-        name=name,
-        type=LayerType.LAMBDA_COST,
-        inputs=[input.name, score.name],
-        NDCG_num=NDCG_num,
-        max_sort_size=max_sort_size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name, LayerType.LAMBDA_COST, parents=[input, score], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy(input,
-                  label,
-                  name=None,
-                  coeff=1.0,
-                  weight=None,
-                  layer_attr=None):
-    """
-    A loss layer for multi class entropy.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy(input=input_layer,
-                            label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput.
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. It is optional.
-    :type weight: LayerOutout
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    ipts, parents = __cost_input__(input, label, weight)
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY,
-        inputs=ipts,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy_with_selfnorm(input,
-                                label,
-                                name=None,
-                                coeff=1.0,
-                                softmax_selfnorm_alpha=0.1,
-                                layer_attr=None):
-    """
-    A loss layer for multi class entropy with selfnorm.
-    Input should be a vector of positive numbers, without normalization.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy_with_selfnorm(input=input_layer,
-                                          label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param softmax_selfnorm_alpha: The scale factor affects the cost.
-    :type softmax_selfnorm_alpha: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.CROSS_ENTROPY_WITH_SELFNORM,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        softmax_selfnorm_alpha=softmax_selfnorm_alpha,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.CROSS_ENTROPY_WITH_SELFNORM,
-        parents=[input, label],
-        size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def sum_cost(input, name=None, layer_attr=None):
-    """
-    A loss layer which calculates the sum of the input as loss.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = sum_cost(input=input_layer)
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput.
-    """
-    assert isinstance(input, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.SUM_COST,
-        inputs=[input.name],
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-
-    return LayerOutput(name, LayerType.SUM_COST, parents=[input], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def huber_regression_cost(input,
-                          label,
-                          name=None,
-                          delta=1.0,
-                          coeff=1.0,
-                          layer_attr=None):
-    """
-    In statistics, the Huber loss is a loss function used in robust regression,
-    that is less sensitive to outliers in data than the squared error loss.
-    Given a prediction f(x), a label y and :math:`\delta`, the loss function
-    is defined as:
-
-    .. math::
-
-       loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta
-
-       loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = huber_regression_cost(input=input_layer, label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param delta: The difference between the observed and predicted values.
-    :type delta: float
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput.
-    """
-    assert isinstance(input, LayerOutput)
-    Layer(
-        name=name,
-        type=LayerType.HUBER_REGRESSION,
-        inputs=[input.name, label.name],
-        delta=delta,
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def huber_classification_cost(input,
-                              label,
-                              name=None,
-                              coeff=1.0,
-                              layer_attr=None):
-    """
-    For classification purposes, a variant of the Huber loss called modified Huber
-    is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
-    a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber
-    loss is defined as:
-
-    .. math:
-
-       loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1
-
-       loss = -4yf(x), otherwise
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = huber_classification_cost(input=input_layer, label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    if input.size is not None:
-        assert input.size == 1
-    Layer(
-        name=name,
-        type=LayerType.HUBER_CLASSIFICATION,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def multi_binary_label_cross_entropy(input,
-                                     label,
-                                     name=None,
-                                     coeff=1.0,
-                                     layer_attr=None):
-    """
-    A loss layer for multi binary label cross entropy.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = multi_binary_label_cross_entropy(input=input_layer,
-                                               label=label_layer)
-
-    :param input: The first input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if input.activation is None or \
-            not isinstance(input.activation, SigmoidActivation):
-        logger.log(logging.WARN,
-                   ("%s is not a recommended activation for "
-                    "multi_binary_label_cross_entropy, sigmoid is better") %
-                   repr(input.activation))
-
-    Layer(
-        name=name,
-        type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
-        parents=[input, label],
-        size=1)
-
-
-class BeamInput(object):
-    """
-    Define the input for cross_entropy_over_beam layer.
-
-    A beam is made up of a triple: the first one is scores over all
-    candidates; the second one is indices of top k selected candidates; the
-    third one is the index of ground truth, which is also always called
-    gold.
-    """
-
-    def __init__(self, candidate_scores, selected_candidates, gold):
-        assert isinstance(candidate_scores, LayerOutput)
-        self.candidate_scores = candidate_scores
-        assert candidate_scores.size == 1
-
-        assert isinstance(selected_candidates, LayerOutput)
-        self.selected_candidates = selected_candidates
-
-        assert isinstance(gold, LayerOutput)
-        self.gold = gold
-
-
-@wrap_name_default()
-@layer_support()
-def cross_entropy_over_beam(input, name=None):
-    """
-    This layer is used in learning to search models, which is to solve complex
-    joint prediction problems based on learning to search through a
-    problem-defined search space.
-
-    Specifically, the learning to search process for this layer begins with
-    searching a target sequence from a nested sequence. In the first search
-    step, top beam size sequences with highest scores, indices of these top k
-    sequences in the original nested sequence, and the ground truth (also
-    called gold) altogether (a triple) make up of the first beam.
-
-    Then, several special positions, for example, start and end positions
-    that define meaningful segments are searched. In these searches, top k
-    positions with highest scores are selected, and then sequence, starting
-    from the selected starts till ends of the sequences (or a fixed position)
-    are taken to search next.
-
-    We call the possible top k results returned in one search the beam. This
-    search process can be repeated for pre-defined turns and leads to several
-    beam expansions.
-
-    Finally, the layer cross_entropy_over_beam takes all the beam expansions
-    which contain several candidate targets found along the multi-step search.
-    cross_entropy_over_beam calculates cross entropy over the expanded beams
-    which all the candidates in the beam as the normalized factor.
-
-    Note that, if gold falls off the beam at search step t, then the cost is
-    calculated over the beam at step t.
-
-    This cost layer always works together with kmax_seq_score_layer,
-    sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
-    sub-search space.
-
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = cross_entropy_over_beam(input=[
-           BeamInput(
-               candidate_scores=beam1_candidates,
-               selected_candidates=beam1_topk,
-               gold=gold1),
-           BeamInput(
-               candidate_scores=beam2_candidates,
-               selected_candidates=beam2_topk,
-               gold=gold2),
-       ])
-
-
-    :param input: Input beams for this layer.
-    :type input: BeamInput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    if isinstance(input, BeamInput):
-        input = [input]
-    else:
-        assert isinstance(input, list), (
-            'input for cross_entropy_over_beam shold be a python list '
-            'of BeamInput object.')
-        for ipt in input:
-            assert isinstance(ipt, BeamInput), (
-                'input for cross_entropy_over_beam '
-                'should be a BeamInput object.')
-
-    ipts = []
-    parents = []
-    for beam in input:
-        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
-        ipts += [
-            beam.candidate_scores.name, beam.selected_candidates.name,
-            beam.gold.name
-        ]
-
-    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
-    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
-
-
-@wrap_name_default()
-@layer_support()
-def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
-    """
-    This is a L1 loss but more smooth. It requires that the
-    sizes of input and label are equal. The formula is as follows,
-
-    .. math::
-
-        L = \sum_{i} smooth_{L1}(input_i - label_i)
-
-    in which
-
-    .. math::
-
-        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
-
-    Reference:
-        `Fast R-CNN
-        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
-
-    The example usage is:
-
-    .. code-block:: python
-
-       cost = smooth_l1_cost(input=input_layer,
-                             label=label_layer)
-
-    :param input: The input layer.
-    :type input: LayerOutput
-    :param label: The input label.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default value.
-    :type coeff: float
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert isinstance(label, LayerOutput)
-    assert input.size == label.size
-
-    Layer(
-        name=name,
-        type=LayerType.SMOOTH_L1,
-        inputs=[input.name, label.name],
-        coeff=coeff,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
-
-
-@wrap_name_default()
-def multiplex_layer(input, name=None, layer_attr=None):
-    """
-    This layer multiplex multiple layers according to the indexes,
-    which are provided by the first input layer.
-    inputs[0]: the indexes of the layers to form the output of size batchSize.
-    inputs[1:N]; the candidate output data.
-    For each index i from 0 to batchSize - 1, the i-th row of the output is the
-    the same to the i-th row of the (index[i] + 1)-th layer.
-
-    For each i-th row of output:
-    .. math::
-        y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
-
-    where, y is output. :math:`x_{k}` is the k-th input layer and
-    :math:`k = x_{0}[i] + 1`.
-
-    The example usage is:
-
-    .. code-block:: python
-
-       maxid = multiplex_layer(input=layers)
-
-    :param input: Input layers.
-    :type input: list of LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute.
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, collections.Sequence)
-    assert len(input) > 2, 'multiplex_layer should have more than 2 inputs'
-    for i in range(1, len(input)):
-        assert isinstance(input[i], LayerOutput)
-        assert input[i].size == input[1].size, \
-            "All the input layers except the first one should have the same size"
-
-    l = Layer(
-        name=name,
-        type='multiplex',
-        inputs=[x.name for x in input],
-        size=input[1].size,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.MULTIPLEX_LAYER,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default("dropout")
-def dropout_layer(input, dropout_rate, name=None):
-    """
-
-    The example usage is:
-
-    .. code-block:: python
-
-        dropout = dropout_layer(input=input_layer, dropout_rate=0.5)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param dropout_rate: The probability of dropout.
-    :type dropout_rate: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    return addto_layer(
-        name=name,
-        input=input,
-        act=LinearActivation(),
-        bias_attr=False,
-        layer_attr=ExtraAttr(drop_rate=dropout_rate))
-
-
-@wrap_name_default()
-@wrap_act_default(act=LinearActivation())
-@wrap_param_attr_default()
-@layer_support(DROPOUT)
-def row_conv_layer(input,
-                   context_len,
-                   act=None,
-                   name=None,
-                   param_attr=None,
-                   layer_attr=None):
-    """
-
-    The row convolution is called lookahead convolution. It is firstly
-    introduced in paper of `Deep Speech 2: End-to-End Speech Recognition
-    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
-
-    The bidirectional RNN that learns representation for a sequence by
-    performing a forward and a backward pass through the entire sequence.
-    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
-    to deploy in an online and low-latency setting. The lookahead convolution
-    incorporates information from future subsequences in a computationally
-    efficient manner to improve unidirectional RNNs.
-
-    The connection of row convolution is different from the 1D sequence
-    convolution. Assumed that, the future context-length is k, that is to say,
-    it can get the output at timestep t by using the the input feature from t-th
-    timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input
-    activations are d, the activations r_t for the new layer at time-step t are:
-
-    .. math::
-
-        r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
-                  \quad \\text{for} \quad  (1 \leq i \leq d)
-
-    Note:
-        The `context_len` is `k + 1`. That is to say, the lookahead step
-        number plus one equals context_len.
-
-
-    .. code-block:: python
-
-       row_conv = row_conv_layer(input=input_layer, context_len=3)
-
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param context_len: The context length equals the lookahead step number
-                        plus one.
-    :type context_len: int
-    :param act: Activation Type. LinearActivation is the default activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert context_len > 0, "the context_len must be greatet than 0."
-
-    Layer(
-        inputs=[Input(input.name, **param_attr.attr)],
-        name=name,
-        context_length=context_len,
-        type=LayerType.ROW_CONV_LAYER,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
-
-
-@layer_support()
-@wrap_name_default()
-def prelu_layer(input,
-                name=None,
-                partial_sum=1,
-                channel_shared=None,
-                num_channels=None,
-                param_attr=None,
-                layer_attr=None):
-    """
-    The Parametric Relu activation that actives outputs with a learnable weight.
-
-    Reference:
-        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
-
-    .. math::
-       z_i &\\quad if \\quad z_i > 0 \\\\
-       a_i * z_i  &\\quad \\mathrm{otherwise}
-
-    The example usage is:
-
-    .. code-block:: python
-
-       prelu = prelu_layer(input=layers, partial_sum=1)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param partial_sum: this parameter makes a group of inputs share the same weight.
-
-        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
-        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
-        - partial_sum = number of outputs, indicates all elements share the same weight.
-
-    :type partial_sum: int
-    :param channel_shared: whether or not the parameter are shared across channels.
-
-        - channel_shared = True, we set the partial_sum to the number of outputs.
-        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
-
-    :type channel_shared: bool
-    :param num_channels: number of input channel.
-    :type num_channels: int
-    :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
-
-    if not param_attr:
-        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
-    else:
-        assert isinstance(param_attr, ParameterAttribute)
-
-    if num_channels is None:
-        assert input.num_filters is not None, \
-                'the input channel cannot be detected, please specify the num_channels parameter'
-        num_channels = input.num_filters
-
-    if channel_shared is not None:
-        assert isinstance(channel_shared, bool)
-        assert (input.height != 0 and input.width != 0), \
-            'input height and widht must be setted'
-        if channel_shared:
-            partial_sum = input.height * input.width * num_channels
-        else:
-            partial_sum = input.height * input.width
-
-    l = Layer(
-        name=name,
-        type=LayerType.PRELU,
-        inputs=Input(input.name, **param_attr.attr),
-        partial_sum=partial_sum,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.PRELU,
-        parents=input,
-        num_filters=num_channels,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support(ERROR_CLIPPING, DROPOUT)
-@wrap_act_default(act=LinearActivation())
-def gated_unit_layer(input,
-                     size,
-                     act=None,
-                     name=None,
-                     gate_attr=None,
-                     gate_param_attr=None,
-                     gate_bias_attr=True,
-                     inproj_attr=None,
-                     inproj_param_attr=None,
-                     inproj_bias_attr=True,
-                     layer_attr=None):
-    """
-    The gated unit layer implements a simple gating mechanism over the input.
-    The input :math:`X` is first projected into a new space :math:`X'`, and
-    it is also used to produce a gate weight :math:`\sigma`. Element-wise
-    product between :math:`X'` and :math:`\sigma` is finally returned.
-
-    Reference:
-        `Language Modeling with Gated Convolutional Networks
-        <https://arxiv.org/abs/1612.08083>`_
-
-    .. math::
-       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
-
-    The example usage is:
-
-    .. code-block:: python
-        gated_unit = gated_unit_layer(size=128, input=input_layer))
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param size: The dimension of this layer's output.
-    :type size: int
-    :param act: Activation type of the projection. LinearActivation is the default
-                activation.
-    :type act: BaseActivation
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param gate_attr: The extra layer attribute of the gate. See ExtraLayerAttribute for
-                      details.
-    :type gate_attr: ExtraLayerAttribute | None
-    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
-                            for details.
-    :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
-                           an object whose type is not ParameterAttribute, no bias is defined.
-                           If this parameter is set to True, the bias is initialized to zero.
-    :type gate_bias_attr: ParameterAttribute | bool | None | Any
-    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
-                        details.
-    :type inproj_attr: ExtraLayerAttribute | None
-    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
-                              for details.
-    :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
-                             or an object whose type is not ParameterAttribute, no bias is defined.
-                             If this parameter is set to True, the bias is initialized to zero.
-    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
-    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(
-        input, LayerOutput), 'The gated linear unit accepts only one input.'
-
-    input_proj = fc_layer(
-        input=input,
-        name="%s_input_proj" % name,
-        size=size,
-        act=act,
-        layer_attr=inproj_attr,
-        param_attr=inproj_param_attr,
-        bias_attr=inproj_bias_attr)
-
-    gate = fc_layer(
-        size=size,
-        name="%s_gate" % name,
-        act=SigmoidActivation(),
-        input=input,
-        layer_attr=gate_attr,
-        param_attr=gate_param_attr,
-        bias_attr=gate_bias_attr)
-    return mixed_layer(
-        name="%s_gated_act" % name,
-        input=dotmul_operator(input_proj, gate),
-        layer_attr=layer_attr)
-
-
-@layer_support()
-@wrap_name_default('switch_order')
-def switch_order_layer(input,
-                       name=None,
-                       reshape_axis=None,
-                       act=None,
-                       layer_attr=None):
-    """
-    This layer switch dimension order of image input.
-    From order "batchSize, channels, height, width"
-    to order "batchSize, height, width, channels".
-
-    The example usage is:
-
-    .. code-block:: python
-       reshape_axis = 3
-       switch = switch_order(input=layer, name='switch', reshape_axis=reshape_axis)
-       reshape = {'height':[ 0, 1, 2], 'width':[3]}
-
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param reshape_axis: Specify the axises of 'height'. Its value should be positive and less than 4.
-    :type reshape_axis: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert reshape_axis != None and (reshape_axis > 0 and reshape_axis < 4)
-    height = [ele for ele in xrange(reshape_axis)]
-    width = [ele for ele in range(reshape_axis, 4)]
-    reshape = {'height': height, 'width': width}
-
-    l = Layer(
-        name=name,
-        inputs=input.name,
-        reshape=reshape,
-        type=LayerType.SWITCH_ORDER_LAYER,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.SWITCH_ORDER_LAYER,
-        activation=act,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
-    """
-    This layer crops images according to the offset and shape. Users can set
-    the crop shape through the argument 'shape' explicitly or by specifying a
-    reference input layer.
-
-    The example usage is:
-
-    .. code-block:: python
-    crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
-
-    :param input: The input of this layer. If two inputs are given, the second one
-                  will be regarded as the reference.
-                  And the input must be 4-dims and in NCHW order.
-    :type input: LayerOutput | Sequence
-    :param offset: The crop offset.
-    :type offset: Sequence
-    :param axis: The start axis to be cropped. For image input layer:
-        - 0: batch size
-        - 1: channels
-        - 2: height
-        - 3: width
-    :type axis: int
-    :param shape: The shape to be cropped to. Default is None.
-    :type shape: Sequence | None
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if isinstance(input, LayerOutput):
-        input = [input]
-    else:
-        assert isinstance(input, collections.Sequence)
-    l = Layer(
-        inputs=[x.name for x in input],
-        axis=axis,
-        offset=offset,
-        shape=shape,
-        name=name,
-        type=LayerType.CROP_LAYER,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.CROP_LAYER,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default()
-@layer_support()
-def sub_nested_seq_layer(input, selected_indices, name=None):
-    """
-    The sub_nested_seq_layer accepts two inputs: the first one is a nested
-    sequence; the second one is a set of selceted indices in the nested sequence.
-
-    Then sub_nest_seq_layer trims the first nested sequence input according
-    to the selected indices to form a new output. This layer is useful in
-    beam training.
-
-    The example usage is:
-
-    .. code-block:: python
-
-        sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
-
-
-    :param input: The input of this layer. It is a nested sequence.
-    :type input: LayerOutput
-    :param selected_indices: A set of sequence indices in the nested sequence.
-    :type input: LayerOutput
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of '
-        'sub_nested_seq_layer must be a Paddle layer.')
-    assert isinstance(selected_indices, LayerOutput), (
-        'The second input of '
-        'sub_nested_seq_layer must be a Paddle layer.')
-
-    l = Layer(
-        inputs=input.name,
-        selected_indices=selected_indices.name,
-        name=name,
-        type=LayerType.SUB_NESTED_SEQ)
-    return LayerOutput(
-        name=name,
-        layer_type=LayerType.SUB_NESTED_SEQ,
-        parents=input,
-        size=l.config.size)
-
-
-@wrap_name_default("clip")
-def clip_layer(input, min, max, name=None):
-    """
-    A layer for clipping the input value by the threshold.
-
-    .. math::
-
-        out[i] = \min (\max (in[i],p_{1} ),p_{2} )
-
-    .. code-block:: python
-
-        clip = clip_layer(input=input_layer, min=-10, max=10)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param min: The lower threshold for clipping.
-    :type min: float
-    :param max: The upper threshold for clipping.
-    :type max: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.CLIP_LAYER,
-        inputs=[input.name],
-        min=min,
-        max=max)
-    return LayerOutput(
-        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-def seq_slice_layer(input, starts, ends, name=None):
-    """
-    seq_slice_layer will return one or several sub-sequences from the
-    input sequence layer given start and end indices.
-
-        - If only start indices are given, and end indices are set to None,
-          this layer slices the input sequence from the given start indices
-          to its end.
-        - If only end indices are given, and start indices are set to None,
-          this layer slices the input sequence from its beginning to the
-          given end indices.
-        - If start and end indices are both given, they should have the same
-          number of elements.
-
-    If start or end indices contains more than one elements, the input sequence
-    will be sliced for multiple times.
-
-
-    .. code-block:: python
-
-        seq_silce = seq_slice_layer(input=input_seq,
-                                    starts=start_pos, ends=end_pos)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, which should be a sequence.
-    :type input: LayerOutput
-    :param starts: The start indices to slice the input sequence.
-    :type starts: LayerOutput | None
-    :param ends: The end indices to slice the input sequence.
-    :type ends: LayerOutput | None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of seq_slice layer must be a PaddlePaddle layer.')
-
-    if starts is not None:
-        assert isinstance(starts, LayerOutput), (
-            'The start indices for seq_slice layer '
-            'must be a PaddlePaddle layer.')
-    if ends is not None:
-        assert isinstance(ends, LayerOutput), (
-            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
-    assert starts is not None or ends is not None, (
-        'start and end indices '
-        'cannot be set to None at the same time, at least one of '
-        'them should be given.')
-    if starts is not None and ends is not None:
-        assert starts.size == ends.size, (
-            'If start and end indices are both given to seq_slice_layer, '
-            'they should have the same width.')
-
-    Layer(
-        name=name,
-        type=LayerType.SEQ_SLICE,
-        inputs=input.name,
-        starts=starts.name if starts is not None else None,
-        ends=ends.name if ends is not None else None)
-    return LayerOutput(
-        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
-
-
-@wrap_name_default()
-@layer_support()
-def kmax_seq_score_layer(input, name=None, beam_size=1):
-    """
-    This layer accepts one input which is scores over a sequence or a nested
-    sequence, and returns indices of beam_size sequences with highest scores.
-
-    .. code-block:: python
-
-        kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size)
-
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer. It stores scores over a sequence or
-                  a nested sequence and its size must be 1.
-    :type input: LayerOutput
-    :param beam_size: The indices of the sequences with top beam_size scores are returned.
-    :type beam_size: int
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput), ("kmax_seq_score_layer "
-                                            "accepts only one input.")
-    assert input.size == 1, (
-        "input of kmax_seq_score_layer is a score "
-        "over a sequence or a nested sequence, so its width must be 1.")
-
-    Layer(
-        name=name,
-        type=LayerType.KMAX_SEQ_SCORE,
-        inputs=[input.name],
-        beam_size=beam_size)
-
-    return LayerOutput(
-        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
-
-
-@wrap_name_default("conv3d")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-@wrap_act_default(act=ReluActivation())
-@layer_support(DROPOUT)
-def img_conv3d_layer(input,
-                     filter_size,
-                     num_filters,
-                     name=None,
-                     num_channels=None,
-                     act=None,
-                     groups=1,
-                     stride=1,
-                     padding=0,
-                     bias_attr=None,
-                     param_attr=None,
-                     shared_biases=True,
-                     layer_attr=None,
-                     trans=False,
-                     layer_type=None):
-    """
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        conv = img_conv3d_layer(input=data, filter_size=1,
-                              num_channels=8,
-                              num_filters=16, stride=1,
-                              bias_attr=False,
-                              act=ReluActivation())
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param filter_size: The dimensions of the filter kernel along three axises. If the parameter
-                        is set to one integer, the three dimensions will be same.
-    :type filter_size: int | tuple | list
-    :param num_filters: The number of filters. It is as same as the output image channel.
-    :type num_filters: int
-    :param act: Activation type. ReluActivation is the default activation.
-    :type act: BaseActivation
-    :param groups: The number of the filter groups.
-    :type groups: int
-    :param stride: The strides of the convolution along three axises. If the parameter
-                   is set to one integer, the three strides will be same.
-    :type stride: int | tuple | list
-    :param padding: The numbers of padding along three axises. If the parameter is set to
-                    one integer, they will be same.
-    :type padding: int | tuple | list
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None, its actual value will be automatically set to
-                         the channels number of the input.
-    :type num_channels: int
-    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param shared_biases: Whether biases will be shared between filters or not.
-    :type shared_biases: bool
-    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
-                       details.
-    :type layer_attr: ExtraLayerAttribute
-    :param trans: True if it is a convTransLayer, False if it is a convLayer
-    :type trans: bool
-    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
-                       when trans=True. If not set, it will be automatically set to "deconv3d"
-                       when trans=True and "conv3d" when trans=False.
-    :type layer_type: basestring
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    if num_channels is None:
-        assert input.num_filters is not None
-        num_channels = input.num_filters
-
-    if isinstance(filter_size, collections.Sequence):
-        assert len(filter_size) == 3
-        filter_size, filter_size_y, filter_size_z = filter_size
-    else:
-        filter_size_y = filter_size
-        filter_size_z = filter_size
-
-    if isinstance(stride, collections.Sequence):
-        assert len(stride) == 3
-        stride, stride_y, stride_z = stride
-    else:
-        stride_y = stride
-        stride_z = stride
-
-    if isinstance(padding, collections.Sequence):
-        assert len(padding) == 3
-        padding, padding_y, padding_z = padding
-    else:
-        padding_y = padding
-        padding_z = padding
-
-    if param_attr.attr.get('initial_smart'):
-        # special initial for conv layers.
-        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
-        param_attr.attr["initial_mean"] = 0.0
-        param_attr.attr["initial_std"] = init_w
-        param_attr.attr["initial_strategy"] = 0
-        param_attr.attr["initial_smart"] = False
-
-    if layer_type:
-        if trans:
-            assert layer_type in ["deconv3d"]
-        lt = layer_type
-    else:
-        lt = LayerType.DECONV3D_LAYER if trans else LayerType.CONV3D_LAYER
-
-    l = Layer(
-        name=name,
-        inputs=Input(
-            input.name,
-            conv=Conv3D(
-                filter_size=filter_size,
-                padding=padding,
-                stride=stride,
-                channels=num_channels,
-                groups=groups,
-                filter_size_y=filter_size_y,
-                padding_y=padding_y,
-                stride_y=stride_y,
-                filter_size_z=filter_size_z,
-                padding_z=padding_z,
-                stride_z=stride_z),
-            **param_attr.attr),
-        active_type=act.name,
-        num_filters=num_filters,
-        bias=ParamAttr.to_bias(bias_attr),
-        shared_biases=shared_biases,
-        type=lt,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name,
-        lt,
-        parents=[input],
-        activation=act,
-        num_filters=num_filters,
-        size=l.config.size)
-
-
-@wrap_name_default("scale_shift")
-@wrap_param_attr_default()
-@wrap_bias_attr_default()
-def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
-    """
-    A layer applies a linear transformation to each element in each row of
-    the input matrix. For each element, the layer first re-scales it and then
-    adds a bias to it.
-
-    This layer is very like the SlopeInterceptLayer, except the scale and
-    bias are trainable.
-
-    .. math::
-
-        y = w * x + b
-
-    .. code-block:: python
-
-        scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer.
-    :type input: LayerOutput
-    :param param_attr: The parameter attribute of scaling. See ParameterAttribute for
-                      details.
-    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(
-        name=name,
-        type=LayerType.SCALE_SHIFT_LAYER,
-        inputs=Input(input.name, **param_attr.attr),
-        bias=ParamAttr.to_bias(bias_attr))
-    return LayerOutput(
-        name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
-
-
-@wrap_name_default("resize")
-def resize_layer(input, size, name=None):
-    """
-    The resize layer resizes the input matrix with a shape of [Height, Width]
-    into the output matrix with a shape of [Height x Width / size, size],
-    where size is the parameter of this layer indicating the output dimension.
-
-    :param input: The input of this layer.
-    :type input: LayerOutput.
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param size: The resized output dimension of this layer.
-    :type size: int
-    :return: A LayerOutput object.
-    :rtype: LayerOutput
-    """
-    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
-    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
-
-
-@wrap_act_default(act=LinearActivation())
-@wrap_name_default('sub_seq')
-def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
-    """
-    sub_seq_layer will return sub-sequences from the input sequences. For each
-    sequence in the input sequence layer, sub_seq_layer will slice it by given
-    offset and size. Please notice that, number of offset value and size value
-    both are equal to the number of sequence in the input layer.
-
-    .. code-block:: python
-
-        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer, which should be sequence.
-    :type input: LayerOutput
-    :param offsets: The offset indices to slice the input sequence, which should
-                    be sequence type.
-    :type offsets: LayerOutput
-    :param sizes: The sizes of the sub-sequences, which should be sequence type.
-    :type sizes: LayerOutput
-    :param act: Activation type, LinearActivation is the default activation.
-    :type act: BaseActivation.
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of sub_seq_layer layer must be a PaddlePaddle layer.')
-    assert isinstance(offsets, LayerOutput), (
-        'The offset indices for sub_seq_layer, '
-        'must be a PaddlePaddle layer.')
-    assert isinstance(sizes, LayerOutput), (
-        'The sizes of sub-sequences, must be a PaddlePaddle layer.')
-
-    Layer(
-        name=name,
-        type=LayerType.SUB_SEQ_LAYER,
-        inputs=[input.name, offsets.name, sizes.name],
-        active_type=act.name,
-        bias=ParamAttr.to_bias(bias_attr))
-
-    return LayerOutput(
-        name,
-        LayerType.SUB_SEQ_LAYER,
-        parents=[input, offsets, sizes],
-        size=input.size)
-
-
-@wrap_name_default('scale_sub_region')
-def scale_sub_region_layer(input, indices, value, name=None):
-    """
-    Given an image or feature map with CHW information, scale_sub_region_layer
-    can be used to multiply a real value to values of a sub continuous region.
-    You can provide start and end indices of CHW for each instance.
-    Please notice that all start indices are counting from 1.
-    The shape of indices should be [batch_size, 6] and the layout for each row
-    is [C_Start, C_End, H_Start, H_End, W_Start, W_End].
-
-    .. code-block:: python
-
-        scale_sub_region = scale_sub_region_layer(input=input,
-                                                  indices=indices,
-                                                  value=value)
-
-    :param name: The name of this layer. It is optional.
-    :type name: basestring
-    :param input: The input of this layer which should contains CHW information.
-    :type input: LayerOutput
-    :param indices: Start index and end index for C H W, the input value should
-                    be a 2-D matrix with shape [batch_size, 6].
-    :type indices: LayerOutput.
-    :param value: value to multiply.
-    :type value: float
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-
-    assert isinstance(input, LayerOutput), (
-        'The first input of scale_sub_region_layer, '
-        'must be a PaddlePaddle layer.')
-    assert isinstance(indices, LayerOutput), (
-        'The start and end indices for CHW, must be a PaddlePaddle layer.')
-    assert isinstance(value, float), (
-        'The value to multiply, must be a real value.')
-
-    Layer(
-        name=name,
-        type=LayerType.SCALE_SUB_REGION_LAYER,
-        inputs=[input.name, indices.name],
-        value=value)
-
-    return LayerOutput(
-        name,
-        LayerType.SCALE_SUB_REGION_LAYER,
-        parents=[input, indices],
-        num_filters=input.num_filters,
-        size=input.size)
-
-
-@wrap_name_default()
-@wrap_act_default(act=LinearActivation())
-@wrap_param_attr_default()
-@layer_support()
-def factorization_machine(input,
-                          factor_size,
-                          act=None,
-                          name=None,
-                          param_attr=None,
-                          layer_attr=None):
-    """
-    The Factorization Machine models pairwise feature interactions as inner
-    product of the learned latent vectors corresponding to each input feature.
-    The Factorization Machine can effectively capture feature interactions
-    especially when the input is sparse.
-
-    This implementation only consider the 2-order feature interactions using
-    Factorization Machine with the formula:
-
-    .. math::
-        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
-
-    Note:
-        X is the input vector with size n. V is the factor matrix. Each row of V
-        is the latent vector corresponding to each input dimesion. The size of
-        each latent vector is k.
-
-    For details of Factorization Machine, please refer to the paper:
-    Factorization machines.
-
-    .. code-block:: python
-        first_order = paddle.layer.fc(input=input,
-                                      size=1,
-                                      act=paddle.activation.Linear())
-        second_order = paddle.layer.factorization_machine(input=input,
-                                                          factor_size=10)
-        fm = paddle.layer.addto(input=[first_order, second_order],
-                                act=paddle.activation.Linear(),
-                                bias_attr=False)
-
-    :param input: The input layer. Supported input types: all input data types
-                  on CPU, and only dense input types on GPU.
-    :type input: LayerOutput
-    :param factor_size: The hyperparameter that defines the dimensionality of
-                        the latent vector size.
-    :type context_len: int
-    :param act: Activation Type. Default is linear activation.
-    :type act: BaseActivation
-    :param param_attr: The parameter attribute. See ParameterAttribute for
-                       details.
-    :type param_attr: ParameterAttribute
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    assert isinstance(input, LayerOutput)
-    assert factor_size > 0, "the factor_size must be greater than 0."
-
-    Layer(
-        inputs=[Input(input.name, **param_attr.attr)],
-        name=name,
-        factor_size=factor_size,
-        type=LayerType.FACTORIZATION_MACHINE,
-        active_type=act.name,
-        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
deleted file mode 100644
index b5cde7bac..000000000
--- a/python/paddle/trainer_config_helpers/networks.py
+++ /dev/null
@@ -1,1813 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-
-from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
-    IdentityActivation, TanhActivation, SequenceSoftmaxActivation
-from attrs import ExtraAttr
-from default_decorators import wrap_name_default, wrap_act_default, \
-    wrap_param_default, wrap_bias_attr_default, wrap_param_attr_default
-from layers import *  # There are too many layers used in network, so import *
-from poolings import MaxPooling, SumPooling
-from paddle.trainer.config_parser import *
-
-__all__ = [
-    'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
-    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
-    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
-    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
-    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
-    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
-]
-
-######################################################
-#                     Text CNN                       #
-######################################################
-
-
-@wrap_name_default("sequence_conv_pooling")
-def sequence_conv_pool(input,
-                       context_len,
-                       hidden_size,
-                       name=None,
-                       context_start=None,
-                       pool_type=None,
-                       context_proj_layer_name=None,
-                       context_proj_param_attr=False,
-                       fc_layer_name=None,
-                       fc_param_attr=None,
-                       fc_bias_attr=None,
-                       fc_act=None,
-                       pool_bias_attr=None,
-                       fc_attr=None,
-                       context_attr=None,
-                       pool_attr=None):
-    """
-    Text convolution pooling group.
-
-    Text input => Context Projection => FC Layer => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param context_len: context projection length. See
-                        context_projection's document.
-    :type context_len: int
-    :param hidden_size: FC Layer size.
-    :type hidden_size: int
-    :param context_start: context start position. See
-                          context_projection's context_start.
-    :type context_start: int|None
-    :param pool_type: pooling layer type. See pooling_layer's document.
-    :type pool_type: BasePoolingType
-    :param context_proj_layer_name: context projection layer name.
-                                    None if user don't care.
-    :type context_proj_layer_name: basestring
-    :param context_proj_param_attr: padding parameter attribute of context projection layer.
-                                    If false, it means padding always be zero.
-    :type context_proj_param_attr: ParameterAttribute|None
-    :param fc_layer_name: fc layer name. None if user don't care.
-    :type fc_layer_name: basestring
-    :param fc_param_attr: fc layer parameter attribute. None if user don't care.
-    :type fc_param_attr: ParameterAttribute|None
-    :param fc_bias_attr: fc bias parameter attribute. False if no bias,
-                         None if user don't care.
-    :type fc_bias_attr: ParameterAttribute|False|None
-    :param fc_act: fc layer activation type. None means tanh.
-    :type fc_act: BaseActivation
-    :param pool_bias_attr: pooling layer bias attr. False if no bias.
-                           None if user don't care.
-    :type pool_bias_attr: ParameterAttribute|False|None
-    :param fc_attr: fc layer extra attribute.
-    :type fc_attr: ExtraLayerAttribute
-    :param context_attr: context projection layer extra attribute.
-    :type context_attr: ExtraLayerAttribute
-    :param pool_attr: pooling layer extra attribute.
-    :type pool_attr: ExtraLayerAttribute
-    :return: layer's output.
-    :rtype: LayerOutput
-    """
-    # Set Default Value to param
-    context_proj_layer_name = "%s_conv_proj" % name \
-        if context_proj_layer_name is None else context_proj_layer_name
-
-    with mixed_layer(
-            name=context_proj_layer_name,
-            size=input.size * context_len,
-            act=LinearActivation(),
-            layer_attr=context_attr) as m:
-        m += context_projection(
-            input,
-            context_len=context_len,
-            context_start=context_start,
-            padding_attr=context_proj_param_attr)
-
-    fc_layer_name = "%s_conv_fc" % name \
-        if fc_layer_name is None else fc_layer_name
-    fl = fc_layer(
-        name=fc_layer_name,
-        input=m,
-        size=hidden_size,
-        act=fc_act,
-        layer_attr=fc_attr,
-        param_attr=fc_param_attr,
-        bias_attr=fc_bias_attr)
-
-    return pooling_layer(
-        name=name,
-        input=fl,
-        pooling_type=pool_type,
-        bias_attr=pool_bias_attr,
-        layer_attr=pool_attr)
-
-
-text_conv_pool = sequence_conv_pool
-
-############################################################################
-#                       Images                                             #
-############################################################################
-
-
-@wrap_name_default("conv_pool")
-def simple_img_conv_pool(input,
-                         filter_size,
-                         num_filters,
-                         pool_size,
-                         name=None,
-                         pool_type=None,
-                         act=None,
-                         groups=1,
-                         conv_stride=1,
-                         conv_padding=0,
-                         bias_attr=None,
-                         num_channel=None,
-                         param_attr=None,
-                         shared_bias=True,
-                         conv_layer_attr=None,
-                         pool_stride=1,
-                         pool_padding=0,
-                         pool_layer_attr=None):
-    """
-    Simple image convolution and pooling group.
-
-    Img input => Conv => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param filter_size: see img_conv_layer for details.
-    :type filter_size: int
-    :param num_filters: see img_conv_layer for details.
-    :type num_filters: int
-    :param pool_size: see img_pool_layer for details.
-    :type pool_size: int
-    :param pool_type: see img_pool_layer for details.
-    :type pool_type: BasePoolingType
-    :param act: see img_conv_layer for details.
-    :type act: BaseActivation
-    :param groups: see img_conv_layer for details.
-    :type groups: int
-    :param conv_stride: see img_conv_layer for details.
-    :type conv_stride: int
-    :param conv_padding: see img_conv_layer for details.
-    :type conv_padding: int
-    :param bias_attr: see img_conv_layer for details.
-    :type bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer for details.
-    :type num_channel: int
-    :param param_attr: see img_conv_layer for details.
-    :type param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer for details.
-    :type conv_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_pool_layer for details.
-    :type pool_stride: int
-    :param pool_padding: see img_pool_layer for details.
-    :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer for details.
-    :type pool_layer_attr: ExtraLayerAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    _conv_ = img_conv_layer(
-        name="%s_conv" % name,
-        input=input,
-        filter_size=filter_size,
-        num_filters=num_filters,
-        num_channels=num_channel,
-        act=act,
-        groups=groups,
-        stride=conv_stride,
-        padding=conv_padding,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias,
-        layer_attr=conv_layer_attr)
-    return img_pool_layer(
-        name="%s_pool" % name,
-        input=_conv_,
-        pool_size=pool_size,
-        pool_type=pool_type,
-        stride=pool_stride,
-        padding=pool_padding,
-        layer_attr=pool_layer_attr)
-
-
-@wrap_name_default("conv_bn_pool")
-def img_conv_bn_pool(input,
-                     filter_size,
-                     num_filters,
-                     pool_size,
-                     name=None,
-                     pool_type=None,
-                     act=None,
-                     groups=1,
-                     conv_stride=1,
-                     conv_padding=0,
-                     conv_bias_attr=None,
-                     num_channel=None,
-                     conv_param_attr=None,
-                     shared_bias=True,
-                     conv_layer_attr=None,
-                     bn_param_attr=None,
-                     bn_bias_attr=None,
-                     bn_layer_attr=None,
-                     pool_stride=1,
-                     pool_padding=0,
-                     pool_layer_attr=None):
-    """
-    Convolution, batch normalization, pooling group.
-
-    Img input => Conv => BN => Pooling => Output.
-
-    :param name: group name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param filter_size: see img_conv_layer for details.
-    :type filter_size: int
-    :param num_filters: see img_conv_layer for details.
-    :type num_filters: int
-    :param pool_size: see img_pool_layer for details.
-    :type pool_size: int
-    :param pool_type: see img_pool_layer for details.
-    :type pool_type: BasePoolingType
-    :param act: see batch_norm_layer for details.
-    :type act: BaseActivation
-    :param groups: see img_conv_layer for details.
-    :type groups: int
-    :param conv_stride: see img_conv_layer for details.
-    :type conv_stride: int
-    :param conv_padding: see img_conv_layer for details.
-    :type conv_padding: int
-    :param conv_bias_attr: see img_conv_layer for details.
-    :type conv_bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer for details.
-    :type num_channel: int
-    :param conv_param_attr: see img_conv_layer for details.
-    :type conv_param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer for details.
-    :type conv_layer_attr: ExtraLayerOutput
-    :param bn_param_attr: see batch_norm_layer for details.
-    :type bn_param_attr: ParameterAttribute
-    :param bn_bias_attr: see batch_norm_layer for details.
-    :type bn_bias_attr: ParameterAttribute
-    :param bn_layer_attr: see batch_norm_layer for details.
-    :type bn_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_pool_layer for details.
-    :type pool_stride: int
-    :param pool_padding: see img_pool_layer for details.
-    :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer for details.
-    :type pool_layer_attr: ExtraLayerAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    __conv__ = img_conv_layer(
-        name="%s_conv" % name,
-        input=input,
-        filter_size=filter_size,
-        num_filters=num_filters,
-        num_channels=num_channel,
-        act=LinearActivation(),
-        groups=groups,
-        stride=conv_stride,
-        padding=conv_padding,
-        bias_attr=conv_bias_attr,
-        param_attr=conv_param_attr,
-        shared_biases=shared_bias,
-        layer_attr=conv_layer_attr)
-    __bn__ = batch_norm_layer(
-        name="%s_bn" % name,
-        input=__conv__,
-        act=act,
-        bias_attr=bn_bias_attr,
-        param_attr=bn_param_attr,
-        layer_attr=bn_layer_attr)
-    return img_pool_layer(
-        name="%s_pool" % name,
-        input=__bn__,
-        pool_type=pool_type,
-        pool_size=pool_size,
-        stride=pool_stride,
-        padding=pool_padding,
-        layer_attr=pool_layer_attr)
-
-
-@wrap_act_default(param_names=['conv_act'], act=ReluActivation())
-@wrap_param_default(
-    param_names=['pool_type'], default_factory=lambda _: MaxPooling())
-def img_conv_group(input,
-                   conv_num_filter,
-                   pool_size,
-                   num_channels=None,
-                   conv_padding=1,
-                   conv_filter_size=3,
-                   conv_act=None,
-                   conv_with_batchnorm=False,
-                   conv_batchnorm_drop_rate=0,
-                   pool_stride=1,
-                   pool_type=None,
-                   param_attr=None):
-    """
-    Image Convolution Group, Used for vgg net.
-
-    :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
-        conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm.
-    :type conv_batchnorm_drop_rate: list
-    :param input: input layer.
-    :type input: LayerOutput
-    :param conv_num_filter: list of output channels num.
-    :type conv_num_filter: list|tuple
-    :param pool_size: pooling filter size.
-    :type pool_size: int
-    :param num_channels: input channels num.
-    :type num_channels: int
-    :param conv_padding: convolution padding size.
-    :type conv_padding: int
-    :param conv_filter_size: convolution filter size.
-    :type conv_filter_size: int
-    :param conv_act: activation funciton after convolution.
-    :type conv_act: BaseActivation
-    :param conv_with_batchnorm: if conv_with_batchnorm[i] is true,
-        there is a batch normalization operation after each convolution.
-    :type conv_with_batchnorm: list
-    :param pool_stride: pooling stride size.
-    :type pool_stride: int
-    :param pool_type: pooling type.
-    :type pool_type: BasePoolingType
-    :param param_attr: param attribute of convolution layer,
-                       None means default attribute.
-    :type param_attr: ParameterAttribute
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    tmp = input
-
-    # Type checks
-    assert isinstance(tmp, LayerOutput)
-    assert isinstance(conv_num_filter, list) or isinstance(conv_num_filter,
-                                                           tuple)
-    for each_num_filter in conv_num_filter:
-        assert isinstance(each_num_filter, int)
-
-    assert isinstance(pool_size, int)
-
-    def __extend_list__(obj):
-        if not hasattr(obj, '__len__'):
-            return [obj] * len(conv_num_filter)
-        else:
-            return obj
-
-    conv_padding = __extend_list__(conv_padding)
-    conv_filter_size = __extend_list__(conv_filter_size)
-    conv_act = __extend_list__(conv_act)
-    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
-    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
-
-    for i in xrange(len(conv_num_filter)):
-        extra_kwargs = dict()
-        if num_channels is not None:
-            extra_kwargs['num_channels'] = num_channels
-            num_channels = None
-        if conv_with_batchnorm[i]:
-            extra_kwargs['act'] = LinearActivation()
-        else:
-            extra_kwargs['act'] = conv_act[i]
-
-        tmp = img_conv_layer(
-            input=tmp,
-            padding=conv_padding[i],
-            filter_size=conv_filter_size[i],
-            num_filters=conv_num_filter[i],
-            param_attr=param_attr,
-            **extra_kwargs)
-
-        # logger.debug("tmp.num_filters = %d" % tmp.num_filters)
-
-        if conv_with_batchnorm[i]:
-            dropout = conv_batchnorm_drop_rate[i]
-            if dropout == 0 or abs(dropout) < 1e-5:  # dropout not set
-                tmp = batch_norm_layer(input=tmp, act=conv_act[i])
-            else:
-                tmp = batch_norm_layer(
-                    input=tmp,
-                    act=conv_act[i],
-                    layer_attr=ExtraAttr(drop_rate=dropout))
-
-    return img_pool_layer(
-        input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)
-
-
-@wrap_name_default("separable_conv")
-def img_separable_conv(input,
-                       num_channels,
-                       num_out_channels,
-                       filter_size,
-                       stride=1,
-                       padding=0,
-                       depth_multiplier=1,
-                       act=None,
-                       bias_attr=None,
-                       param_attr=None,
-                       shared_bias=True,
-                       layer_type='exconv',
-                       name=None):
-    """
-    Separable Convolution.
-
-    The separable convolution module is consisted of a depthwise convolution
-    that acts separately on input channels, followed by a pointwise convolution
-    with 1*1 kernels that mixes channels. It is used for Xception:
-    https://arxiv.org/pdf/1610.02357.pdf
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param num_channels: the number of input channels.
-    :type num_channels: int
-    :param num_out_channels: the number of output channels.
-    :type num_out_channels: int
-    :param filter_size: the filter size for the depthwise convolution.
-    :type filter_size: int|tuple
-    :param stride: the stride size for the depthwise convolution.
-    :type stride: int|tuple
-    :param padding: the padding size for the depthwise convolution.
-    :type padding: int|tuple
-    :param depth_multiplier: the number of filter for one channel in the
-                             depthwize convolution.
-    :type depth_multiplier: int
-    :param act: the activation function for the output.
-    :type act: BaseActivation
-    :param bias_attr: see img_conv_layer for details.
-    :type bias_attr: ParameterAttribute
-    :param param_attr: see img_conv_layer for details.
-    :type param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details.
-    :type shared_bias: bool
-    :param layer_type: see img_conv_layer for details.
-    :type layer_type: bool
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-    __depthwise_conv__ = img_conv_layer(
-        name="%s_depthwise_conv" % name,
-        input=input,
-        num_channels=num_channels,
-        num_filters=num_channels * depth_multiplier,
-        groups=num_channels,
-        filter_size=filter_size,
-        stride=stride,
-        padding=padding,
-        act=LinearActivation(),
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias,
-        layer_type=layer_type)
-    __pointwise_conv__ = img_conv_layer(
-        name="%s_pointwise_conv" % name,
-        input=__depthwise_conv__,
-        num_channels=num_channels * depth_multiplier,
-        num_filters=num_out_channels,
-        filter_size=1,
-        stride=1,
-        padding=0,
-        act=act,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        shared_biases=shared_bias)
-    return __pointwise_conv__
-
-
-def small_vgg(input_image, num_channels, num_classes):
-    def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
-        return img_conv_group(
-            input=ipt,
-            num_channels=num_channels_,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * times,
-            conv_filter_size=3,
-            conv_act=ReluActivation(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=MaxPooling())
-
-    tmp = __vgg__(input_image, 64, 2, [0.3, 0], num_channels)
-    tmp = __vgg__(tmp, 128, 2, [0.4, 0])
-    tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0])
-    tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0])
-    tmp = img_pool_layer(
-        input=tmp, stride=2, pool_size=2, pool_type=MaxPooling())
-    tmp = dropout_layer(input=tmp, dropout_rate=0.5)
-    tmp = fc_layer(
-        input=tmp,
-        size=512,
-        layer_attr=ExtraAttr(drop_rate=0.5),
-        act=LinearActivation())
-    tmp = batch_norm_layer(input=tmp, act=ReluActivation())
-    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-
-
-def vgg_16_network(input_image, num_channels, num_classes=1000):
-    """
-    Same model from https://gist.github.com/ksimonyan/211839e770f7b538e2d8
-
-    :param num_classes: number of class.
-    :type num_classes: int
-    :param input_image: input layer.
-    :type input_image: LayerOutput
-    :param num_channels: input channels num.
-    :type num_channels: int
-    :return: layer's output
-    :rtype: LayerOutput
-    """
-
-    tmp = img_conv_group(
-        input=input_image,
-        num_channels=num_channels,
-        conv_padding=1,
-        conv_num_filter=[64, 64],
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_size=2,
-        pool_stride=2,
-        pool_type=MaxPooling())
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[128, 128],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[256, 256, 256],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[512, 512, 512],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-    tmp = img_conv_group(
-        input=tmp,
-        conv_num_filter=[512, 512, 512],
-        conv_padding=1,
-        conv_filter_size=3,
-        conv_act=ReluActivation(),
-        pool_stride=2,
-        pool_type=MaxPooling(),
-        pool_size=2)
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    tmp = fc_layer(
-        input=tmp,
-        size=4096,
-        act=ReluActivation(),
-        layer_attr=ExtraAttr(drop_rate=0.5))
-
-    return fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-
-
-############################################################################
-#                       Recurrent                                          #
-############################################################################
-
-
-@wrap_name_default("lstm")
-def simple_lstm(input,
-                size,
-                name=None,
-                reverse=False,
-                mat_param_attr=None,
-                bias_param_attr=None,
-                inner_param_attr=None,
-                act=None,
-                gate_act=None,
-                state_act=None,
-                mixed_layer_attr=None,
-                lstm_cell_attr=None):
-    """
-    Simple LSTM Cell.
-
-    It just combines a mixed layer with fully_matrix_projection and a lstmemory
-    layer. The simple lstm cell was implemented with follow equations.
-
-    ..  math::
-
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-    Please refer to **Generating Sequences With Recurrent Neural Networks** for more
-    details about lstm. Link_ is here.
-
-    .. _Link: http://arxiv.org/abs/1308.0850
-
-    :param name: lstm layer name.
-    :type name: basestring
-    :param input: layer's input.
-    :type input: LayerOutput
-    :param size: lstm layer size.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param mat_param_attr: parameter attribute of matrix projection in mixed layer.
-    :type mat_param_attr: ParameterAttribute
-    :param bias_param_attr: bias parameter attribute. False means no bias, None
-                            means default bias.
-    :type bias_param_attr: ParameterAttribute|False
-    :param inner_param_attr: parameter attribute of lstm cell.
-    :type inner_param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param mixed_layer_attr: extra attribute of mixed layer.
-    :type mixed_layer_attr: ExtraLayerAttribute
-    :param lstm_cell_attr: extra attribute of lstm.
-    :type lstm_cell_attr: ExtraLayerAttribute
-    :return: layer's output.
-    :rtype: LayerOutput
-    """
-    fc_name = 'lstm_transform_%s' % name
-    with mixed_layer(
-            name=fc_name,
-            size=size * 4,
-            act=IdentityActivation(),
-            layer_attr=mixed_layer_attr,
-            bias_attr=False) as m:
-        m += full_matrix_projection(input, param_attr=mat_param_attr)
-
-    return lstmemory(
-        name=name,
-        input=m,
-        reverse=reverse,
-        bias_attr=bias_param_attr,
-        param_attr=inner_param_attr,
-        act=act,
-        gate_act=gate_act,
-        state_act=state_act,
-        layer_attr=lstm_cell_attr)
-
-
-@wrap_name_default('lstm_unit')
-def lstmemory_unit(input,
-                   out_memory=None,
-                   name=None,
-                   size=None,
-                   param_attr=None,
-                   act=None,
-                   gate_act=None,
-                   state_act=None,
-                   input_proj_bias_attr=None,
-                   input_proj_layer_attr=None,
-                   lstm_bias_attr=None,
-                   lstm_layer_attr=None):
-    """
-    lstmemory_unit defines the caculation process of a LSTM unit during a
-    single time step. This function is not a recurrent layer, so it can not be
-    directly used to process sequence input. This function is always used in
-    recurrent_group (see layers.py for more details) to implement attention
-    mechanism.
-
-    Please refer to  **Generating Sequences With Recurrent Neural Networks**
-    for more details about LSTM. The link goes as follows:
-    .. _Link: https://arxiv.org/abs/1308.0850
-
-    ..  math::
-
-        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
-
-        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
-
-        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
-
-        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
-
-        h_t & = o_t tanh(c_t)
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        lstm_step = lstmemory_unit(input=[layer1],
-                                   size=256,
-                                   act=TanhActivation(),
-                                   gate_act=SigmoidActivation(),
-                                   state_act=TanhActivation())
-
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param out_memory: The output of previous time step.
-    :type out_memory: LayerOutput | None
-    :param name: The lstmemory unit name.
-    :type name: basestring
-    :param size: The lstmemory unit size.
-    :type size: int
-    :param param_attr: The parameter attribute for the weights in
-                     input to hidden projection.
-                     None means default attribute.
-    :type param_attr: ParameterAttribute
-    :param act: The last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: The gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: The state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param input_proj_bias_attr: The parameter attribute for the bias in
-                      input to hidden projection.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type input_proj_bias_attr: ParameterAttribute|bool|None
-    :param input_proj_layer_attr: The extra layer attribute for
-                     input to hidden projection of the LSTM unit,
-                     such as dropout, error clipping.
-    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type lstm_bias_attr: ParameterAttribute|True|None
-    :param lstm_layer_attr: The extra attribute of lstm layer.
-    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: The lstmemory unit name.
-    :rtype: LayerOutput
-    """
-    if size is None:
-        assert input.size % 4 == 0
-        size = input.size / 4
-    if out_memory is None:
-        out_mem = memory(name=name, size=size)
-    else:
-        out_mem = out_memory
-
-    state_mem = memory(name="%s_state" % name, size=size)
-
-    with mixed_layer(
-            name="%s_input_recurrent" % name,
-            size=size * 4,
-            bias_attr=input_proj_bias_attr,
-            layer_attr=input_proj_layer_attr,
-            act=IdentityActivation()) as m:
-        m += identity_projection(input=input)
-        m += full_matrix_projection(input=out_mem, param_attr=param_attr)
-
-    lstm_out = lstm_step_layer(
-        name=name,
-        input=m,
-        state=state_mem,
-        size=size,
-        bias_attr=lstm_bias_attr,
-        act=act,
-        gate_act=gate_act,
-        state_act=state_act,
-        layer_attr=lstm_layer_attr)
-    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')
-
-    return lstm_out
-
-
-@wrap_name_default('lstm_group')
-def lstmemory_group(input,
-                    size=None,
-                    name=None,
-                    out_memory=None,
-                    reverse=False,
-                    param_attr=None,
-                    act=None,
-                    gate_act=None,
-                    state_act=None,
-                    input_proj_bias_attr=None,
-                    input_proj_layer_attr=None,
-                    lstm_bias_attr=None,
-                    lstm_layer_attr=None):
-    """
-    lstm_group is a recurrent_group version of Long Short Term Memory. It
-    does exactly the same calculation as the lstmemory layer (see lstmemory in
-    layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states(or hidden states) in every time step are accessible to the
-    user. This is especially useful in attention model. If you do not need to
-    access the internal states of the lstm and merely use its outputs,
-    it is recommended to use the lstmemory, which is relatively faster than
-    lstmemory_group.
-
-    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
-    multiplications:
-    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
-    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
-    speed up the calculations. Consequently, an additional mixed_layer with
-    full_matrix_projection must be included before lstmemory_unit is called.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        lstm_step = lstmemory_group(input=[layer1],
-                                    size=256,
-                                    act=TanhActivation(),
-                                    gate_act=SigmoidActivation(),
-                                    state_act=TanhActivation())
-
-    :param input: Input layer.
-    :type input: LayerOutput
-    :param size: The lstmemory group size.
-    :type size: int
-    :param name: The name of lstmemory group.
-    :type name: basestring
-    :param out_memory: The output of previous time step.
-    :type out_memory: LayerOutput | None
-    :param reverse: Process the input in a reverse order or not.
-    :type reverse: bool
-    :param param_attr: The parameter attribute for the weights in
-                     input to hidden projection.
-                     None means default attribute.
-    :type param_attr: ParameterAttribute
-    :param act: The last activiation type of lstm.
-    :type act: BaseActivation
-    :param gate_act: The gate activiation type of lstm.
-    :type gate_act: BaseActivation
-    :param state_act: The state activiation type of lstm.
-    :type state_act: BaseActivation
-    :param input_proj_bias_attr: The parameter attribute for the bias in
-                      input to hidden projection.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type input_proj_bias_attr: ParameterAttribute|bool|None
-    :param input_proj_layer_attr: The extra layer attribute for
-                     input to hidden projection of the LSTM unit,
-                     such as dropout, error clipping.
-    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
-                      False or None means no bias.
-                      If this parameter is set to True,
-                      the bias is initialized to zero.
-    :type lstm_bias_attr: ParameterAttribute|True|None
-    :param lstm_layer_attr: The extra attribute of lstm layer.
-    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: the lstmemory group.
-    :rtype: LayerOutput
-    """
-
-    def __lstm_step__(ipt):
-        return lstmemory_unit(
-            input=ipt,
-            name=name,
-            size=size,
-            act=act,
-            gate_act=gate_act,
-            state_act=state_act,
-            out_memory=out_memory,
-            input_proj_bias_attr=input_proj_bias_attr,
-            input_proj_layer_attr=input_proj_layer_attr,
-            param_attr=param_attr,
-            lstm_layer_attr=lstm_layer_attr,
-            lstm_bias_attr=lstm_bias_attr)
-
-    return recurrent_group(
-        name='%s_recurrent_group' % name,
-        step=__lstm_step__,
-        reverse=reverse,
-        input=input)
-
-
-@wrap_name_default('gru_unit')
-def gru_unit(input,
-             memory_boot=None,
-             size=None,
-             name=None,
-             gru_bias_attr=None,
-             gru_param_attr=None,
-             act=None,
-             gate_act=None,
-             gru_layer_attr=None,
-             naive=False):
-    """
-    gru_unit defines the calculation process of a gated recurrent unit during a single
-    time step. This function is not a recurrent layer, so it can not be
-    directly used to process sequence input. This function is always used in
-    the recurrent_group (see layers.py for more details) to implement attention
-    mechanism.
-
-    Please see grumemory in layers.py for the details about the maths.
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param act: activation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activation type or gru
-    :type gate_act: BaseActivation
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru output layer.
-    :rtype: LayerOutput
-    """
-
-    assert input.size % 3 == 0
-    if size is None:
-        size = input.size / 3
-
-    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
-
-    if naive:
-        __step__ = gru_step_naive_layer
-    else:
-        __step__ = gru_step_layer
-
-    gru_out = __step__(
-        name=name,
-        input=input,
-        output_mem=out_mem,
-        size=size,
-        bias_attr=gru_bias_attr,
-        param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        layer_attr=gru_layer_attr)
-    return gru_out
-
-
-@wrap_name_default('gru_group')
-def gru_group(input,
-              memory_boot=None,
-              size=None,
-              name=None,
-              reverse=False,
-              gru_bias_attr=None,
-              gru_param_attr=None,
-              act=None,
-              gate_act=None,
-              gru_layer_attr=None,
-              naive=False):
-    """
-    gru_group is a recurrent_group version of Gated Recurrent Unit. It
-    does exactly the same calculation as the grumemory layer does. A promising
-    benefit is that gru hidden states are accessible to the user. This is
-    especially useful in attention model. If you do not need to access
-    any internal state and merely use the outputs of a GRU, it is recommended
-    to use the grumemory, which is relatively faster.
-
-    Please see grumemory in layers.py for more detail about the maths.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = gru_group(input=[layer1],
-                        size=256,
-                        act=TanhActivation(),
-                        gate_act=SigmoidActivation())
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-
-    def __gru_step__(ipt):
-        return gru_unit(
-            input=ipt,
-            memory_boot=memory_boot,
-            name=name,
-            size=size,
-            gru_bias_attr=gru_bias_attr,
-            gru_param_attr=gru_param_attr,
-            act=act,
-            gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr,
-            naive=naive)
-
-    return recurrent_group(
-        name='%s_recurrent_group' % name,
-        step=__gru_step__,
-        reverse=reverse,
-        input=input)
-
-
-@wrap_name_default('simple_gru')
-def simple_gru(input,
-               size,
-               name=None,
-               reverse=False,
-               mixed_param_attr=None,
-               mixed_bias_param_attr=None,
-               mixed_layer_attr=None,
-               gru_bias_attr=None,
-               gru_param_attr=None,
-               act=None,
-               gate_act=None,
-               gru_layer_attr=None,
-               naive=False):
-    """
-    You may see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
-    simple_gru in network.py. The reason why there are so many interfaces is
-    that we have two ways to implement recurrent neural network. One way is to
-    use one complete layer to implement rnn (including simple rnn, gru and lstm)
-    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But
-    the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py.
-    The other implementation is to use an recurrent group which can ensemble a
-    series of layers to compute rnn step by step. This way is flexible for
-    attenion mechanism or other complex connections.
-
-    - gru_step_layer: only compute rnn by one step. It needs an memory as input
-      and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory.
-    - gru_group: a GRU cell implemented by a combination of multiple layers in
-      recurrent group.
-      But :math:`W x_t` is not done in group.
-    - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group.
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and
-      gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory.
-
-    The computational speed is that, grumemory is relatively better than
-    gru_group, and gru_group is relatively better than simple_gru.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = simple_gru(input=[layer1], size=256)
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_layer_attr: Extra attribute of the gru layer.
-    :type gru_layer_attr: ExtraLayerAttribute
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name='%s_transform' % name,
-            size=size * 3,
-            bias_attr=mixed_bias_param_attr,
-            layer_attr=mixed_layer_attr) as m:
-        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
-
-    return gru_group(
-        name=name,
-        size=size,
-        input=m,
-        reverse=reverse,
-        gru_bias_attr=gru_bias_attr,
-        gru_param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr,
-        naive=naive)
-
-
-@wrap_name_default('simple_gru2')
-def simple_gru2(input,
-                size,
-                name=None,
-                reverse=False,
-                mixed_param_attr=None,
-                mixed_bias_attr=None,
-                gru_param_attr=None,
-                gru_bias_attr=None,
-                act=None,
-                gate_act=None,
-                mixed_layer_attr=None,
-                gru_cell_attr=None):
-    """
-    simple_gru2 is the same with simple_gru, but using grumemory instead.
-    Please refer to grumemory in layers.py for more detail about the math.
-    simple_gru2 is faster than simple_gru.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        gru = simple_gru2(input=[layer1], size=256)
-
-    :param input: input layer.
-    :type input: LayerOutput
-    :param name: name of the gru group.
-    :type name: basestring
-    :param size: hidden size of the gru.
-    :type size: int
-    :param reverse: process the input in a reverse order or not.
-    :type reverse: bool
-    :param act: activiation type of gru
-    :type act: BaseActivation
-    :param gate_act: gate activiation type of gru
-    :type gate_act: BaseActivation
-    :param gru_bias_attr: bias parameter attribute of gru layer,
-                          False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False|None
-    :param gru_param_attr: param parameter attribute of gru layer,
-                          None means default param.
-    :type gru_param_attr: ParameterAttribute|None
-    :return: the gru group.
-    :rtype: LayerOutput
-    """
-    with mixed_layer(
-            name='%s_transform' % name,
-            size=size * 3,
-            bias_attr=mixed_bias_attr,
-            layer_attr=mixed_layer_attr) as m:
-        m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
-
-    return grumemory(
-        name=name,
-        input=m,
-        reverse=reverse,
-        bias_attr=gru_bias_attr,
-        param_attr=gru_param_attr,
-        act=act,
-        gate_act=gate_act,
-        layer_attr=gru_cell_attr)
-
-
-@wrap_name_default("bidirectional_gru")
-def bidirectional_gru(input,
-                      size,
-                      name=None,
-                      return_seq=False,
-                      fwd_mixed_param_attr=None,
-                      fwd_mixed_bias_attr=None,
-                      fwd_gru_param_attr=None,
-                      fwd_gru_bias_attr=None,
-                      fwd_act=None,
-                      fwd_gate_act=None,
-                      fwd_mixed_layer_attr=None,
-                      fwd_gru_cell_attr=None,
-                      bwd_mixed_param_attr=None,
-                      bwd_mixed_bias_attr=None,
-                      bwd_gru_param_attr=None,
-                      bwd_gru_bias_attr=None,
-                      bwd_act=None,
-                      bwd_gate_act=None,
-                      bwd_mixed_layer_attr=None,
-                      bwd_gru_cell_attr=None,
-                      last_seq_attr=None,
-                      first_seq_attr=None,
-                      concat_attr=None,
-                      concat_act=None):
-    """
-    A bidirectional_gru is a recurrent unit that iterates over the input
-    sequence both in forward and backward orders, and then concatenate two
-    outputs to form a final output. However, concatenation of two outputs
-    is not the only way to form the final output, you can also, for example,
-    just add them together.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        bi_gru = bidirectional_gru(input=[input1], size=512)
-
-    :param name: bidirectional gru layer name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param size: gru layer size.
-    :type size: int
-    :param return_seq: If set False, the last time step of output are
-                       concatenated and returned.
-                       If set True, the entire output sequences in forward
-                       and backward directions are concatenated and returned.
-    :type return_seq: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    args = locals()
-
-    fw = simple_gru2(
-        name='%s_fw' % name,
-        input=input,
-        size=size,
-        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('fwd_')))
-
-    bw = simple_gru2(
-        name="%s_bw" % name,
-        input=input,
-        size=size,
-        reverse=True,
-        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('bwd_')))
-
-    if return_seq:
-        return concat_layer(
-            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
-    else:
-        fw_seq = last_seq(
-            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
-        bw_seq = first_seq(
-            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
-        return concat_layer(
-            name=name,
-            input=[fw_seq, bw_seq],
-            layer_attr=concat_attr,
-            act=concat_act)
-
-
-@wrap_name_default("bidirectional_lstm")
-def bidirectional_lstm(input,
-                       size,
-                       name=None,
-                       return_seq=False,
-                       fwd_mat_param_attr=None,
-                       fwd_bias_param_attr=None,
-                       fwd_inner_param_attr=None,
-                       fwd_act=None,
-                       fwd_gate_act=None,
-                       fwd_state_act=None,
-                       fwd_mixed_layer_attr=None,
-                       fwd_lstm_cell_attr=None,
-                       bwd_mat_param_attr=None,
-                       bwd_bias_param_attr=None,
-                       bwd_inner_param_attr=None,
-                       bwd_act=None,
-                       bwd_gate_act=None,
-                       bwd_state_act=None,
-                       bwd_mixed_layer_attr=None,
-                       bwd_lstm_cell_attr=None,
-                       last_seq_attr=None,
-                       first_seq_attr=None,
-                       concat_attr=None,
-                       concat_act=None):
-    """
-    A bidirectional_lstm is a recurrent unit that iterates over the input
-    sequence both in forward and backward orders, and then concatenate two
-    outputs to form a final output. However, concatenation of two outputs
-    is not the only way to form the final output, you can also, for example,
-    just add them together.
-
-    Please refer to  **Neural Machine Translation by Jointly Learning to Align
-    and Translate** for more details about the bidirectional lstm.
-    The link goes as follows:
-    .. _Link: https://arxiv.org/pdf/1409.0473v3.pdf
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        bi_lstm = bidirectional_lstm(input=[input1], size=512)
-
-    :param name: bidirectional lstm layer name.
-    :type name: basestring
-    :param input: input layer.
-    :type input: LayerOutput
-    :param size: lstm layer size.
-    :type size: int
-    :param return_seq: If set False, the last time step of output are
-                       concatenated and returned.
-                       If set True, the entire output sequences in forward
-                       and backward directions are concatenated and returned.
-    :type return_seq: bool
-    :return: LayerOutput object.
-    :rtype: LayerOutput
-    """
-    args = locals()
-
-    fw = simple_lstm(
-        name='%s_fw' % name,
-        input=input,
-        size=size,
-        **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('fwd_')))
-
-    bw = simple_lstm(
-        name="%s_bw" % name,
-        input=input,
-        size=size,
-        reverse=True,
-        **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
-               if k.startswith('bwd_')))
-
-    if return_seq:
-        return concat_layer(
-            name=name, input=[fw, bw], layer_attr=concat_attr, act=concat_act)
-    else:
-        fw_seq = last_seq(
-            name="%s_fw_last" % name, input=fw, layer_attr=last_seq_attr)
-        bw_seq = first_seq(
-            name="%s_bw_last" % name, input=bw, layer_attr=first_seq_attr)
-        return concat_layer(
-            name=name,
-            input=[fw_seq, bw_seq],
-            layer_attr=concat_attr,
-            act=concat_act)
-
-
-@wrap_name_default()
-@wrap_act_default(param_names=['weight_act'], act=TanhActivation())
-def simple_attention(encoded_sequence,
-                     encoded_proj,
-                     decoder_state,
-                     transform_param_attr=None,
-                     softmax_param_attr=None,
-                     weight_act=None,
-                     name=None):
-    """
-    Calculate and return a context vector with attention mechanism.
-    Size of the context vector equals to size of the encoded_sequence.
-
-    ..  math::
-
-        a(s_{i-1},h_{j}) & = v_{a}f(W_{a}s_{t-1} + U_{a}h_{j})
-
-        e_{i,j} & = a(s_{i-1}, h_{j})
-
-        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
-
-        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}h_{j}
-
-    where :math:`h_{j}` is the jth element of encoded_sequence,
-    :math:`U_{a}h_{j}` is the jth element of encoded_proj
-    :math:`s_{i-1}` is decoder_state
-    :math:`f` is weight_act, and is set to tanh by default.
-
-    Please refer to **Neural Machine Translation by Jointly Learning to
-    Align and Translate** for more details. The link is as follows:
-    https://arxiv.org/abs/1409.0473.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = simple_attention(encoded_sequence=enc_seq,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_prev,)
-
-    :param name: name of the attention model.
-    :type name: basestring
-    :param softmax_param_attr: parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param weight_act: activation of the attention model.
-    :type weight_act: BaseActivation
-    :param encoded_sequence: output of the encoder
-    :type encoded_sequence: LayerOutput
-    :param encoded_proj: attention weight is computed by a feed forward neural
-                         network which has two inputs : decoder's hidden state
-                         of previous time step and encoder's output.
-                         encoded_proj is output of the feed-forward network for
-                         encoder's output. Here we pre-compute it outside
-                         simple_attention for speed consideration.
-    :type encoded_proj: LayerOutput
-    :param decoder_state: hidden state of decoder in previous time step
-    :type decoder_state: LayerOutput
-    :param transform_param_attr: parameter attribute of the feed-forward
-                                network that takes decoder_state as inputs to
-                                compute attention weight.
-    :type transform_param_attr: ParameterAttribute
-    :return: a context vector
-    :rtype: LayerOutput
-    """
-    assert encoded_proj.size == decoder_state.size
-    proj_size = encoded_proj.size
-
-    with mixed_layer(size=proj_size, name="%s_transform" % name) as m:
-        m += full_matrix_projection(
-            decoder_state, param_attr=transform_param_attr)
-
-    expanded = expand_layer(
-        input=m, expand_as=encoded_sequence, name='%s_expand' % name)
-
-    with mixed_layer(
-            size=proj_size, act=weight_act, name="%s_combine" % name) as m:
-        m += identity_projection(expanded)
-        m += identity_projection(encoded_proj)
-
-    # sequence softmax is used to normalize similarities between decoder state
-    # and encoder outputs into a distribution
-    attention_weight = fc_layer(
-        input=m,
-        size=1,
-        act=SequenceSoftmaxActivation(),
-        param_attr=softmax_param_attr,
-        name="%s_softmax" % name,
-        bias_attr=False)
-
-    scaled = scaling_layer(
-        weight=attention_weight,
-        input=encoded_sequence,
-        name='%s_scaling' % name)
-
-    return pooling_layer(
-        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
-
-
-@wrap_name_default()
-def dot_product_attention(encoded_sequence,
-                          attended_sequence,
-                          transformed_state,
-                          softmax_param_attr=None,
-                          name=None):
-    """
-    Calculate and return a context vector with dot-product attention mechanism.
-    The dimension of the context vector equals to that of the attended_sequence.
-
-    ..  math::
-
-        a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
-
-        e_{i,j} & = a(s_{i-1}, h_{j})
-
-        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
-
-        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
-
-    where :math:`h_{j}` is the jth element of encoded_sequence,
-    :math:`z_{j}` is the jth element of attended_sequence,
-    :math:`s_{i-1}` is transformed_state.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = dot_product_attention(encoded_sequence=enc_seq,
-                                        attended_sequence=att_seq,
-                                        transformed_state=state,)
-
-    :param name: A prefix attached to the name of each layer that defined inside
-                 the dot_product_attention.
-    :type name: basestring
-    :param softmax_param_attr: The parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param encoded_sequence: The output hidden vectors of the encoder.
-    :type encoded_sequence: LayerOutput
-    :param attended_sequence: The attention weight is computed by a feed forward neural
-                              network which has two inputs : decoder's transformed hidden
-                              state of previous time step and encoder's output.
-                              attended_sequence is the sequence to be attended.
-    :type attended_sequence: LayerOutput
-    :param transformed_state: The transformed hidden state of decoder in previous time step.
-                              Since the dot-product operation will be performed on it and the
-                              encoded_sequence, their dimensions must be equal. For flexibility,
-                              we suppose transformations of the decoder's hidden state have been
-                              done outside dot_product_attention and no more will be performed
-                              inside. Then users can use either the original or transformed one.
-    :type transformed_state: LayerOutput
-    :return: The context vector.
-    :rtype: LayerOutput
-    """
-    assert transformed_state.size == encoded_sequence.size
-
-    expanded = expand_layer(
-        input=transformed_state,
-        expand_as=encoded_sequence,
-        name='%s_expand' % name)
-
-    m = dot_prod_layer(
-        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)
-
-    attention_weight = fc_layer(
-        input=m,
-        size=1,
-        act=SequenceSoftmaxActivation(),
-        param_attr=softmax_param_attr,
-        name="%s_softmax" % name,
-        bias_attr=False)
-
-    scaled = scaling_layer(
-        weight=attention_weight,
-        input=attended_sequence,
-        name='%s_scaling' % name)
-
-    return pooling_layer(
-        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
-
-
-@wrap_name_default()
-def multi_head_attention(query,
-                         key,
-                         value,
-                         key_proj_size,
-                         value_proj_size,
-                         head_num,
-                         attention_type,
-                         softmax_param_attr=None,
-                         name=None):
-    """
-    Calculate and return a context vector with dot-product attention mechanism.
-    The dimension of the context vector equals to value_proj_size * head_num.
-
-    Please refer to **Attention Is All You Need** for more details. The link is
-    as follows:
-    https://arxiv.org/abs/1706.03762.
-
-    The example usage is:
-
-    ..  code-block:: python
-
-        context = multi_head_attention(query=decoder_state,
-                                       key=enc_seq,
-                                       value=enc_seq,
-                                       key_proj_size=64,
-                                       value_pro_size=64,
-                                       head_num=8,
-                                       attention_type='dot-product attention')
-
-    :param name: A prefix attached to the name of each layer that defined inside
-                 the multi_head_attention.
-    :type name: basestring
-    :param softmax_param_attr: The parameter attribute of sequence softmax
-                               that is used to produce attention weight.
-    :type softmax_param_attr: ParameterAttribute
-    :param query: query is used to calculate attention weights over values at current step.
-    :type query: LayerOutput
-    :param key: key is used to calculate the attention weight of the corresponding value.
-    :type key: LayerOutput
-    :param value: value is the sequence to be attended.
-    :type value: LayerOutput
-    :param key_proj_size: The dimension of the linear projection performed on key and query.
-    :type key_proj_size: int
-    :param value_proj_size: The dimension of the linear projection performed on value.
-    :type value_proj_size: int
-    :param head_num: The number of attention heads.
-    :type head_num: int
-    :param attention_type: The type of the attention mechanism used in each attention
-                           heads. Now, we only support scaled dot-product attention and
-                           additive attention.
-    :type attention_type: basestring
-    :return: The context vector.
-    :rtype: LayerOutput
-    """
-    assert attention_type in ['dot-product attention', 'additive attention']
-
-    with mixed_layer(
-            size=key_proj_size * head_num,
-            name='%s_query_proj' % name) as query_proj:
-        query_proj += full_matrix_projection(query)
-    query_proj = expand_layer(input=query_proj, expand_as=key)
-
-    with mixed_layer(
-            size=key_proj_size * head_num,
-            name='%s_key_proj' % name) as key_proj:
-        key_proj += full_matrix_projection(key)
-
-    with mixed_layer(
-            size=value_proj_size * head_num,
-            name='%s_value_proj' % name) as value_proj:
-        value_proj += full_matrix_projection(value)
-
-    head_list = []
-    for i in range(head_num):
-        with mixed_layer(size=key_proj_size) as sub_query_proj:
-            sub_query_proj += identity_projection(
-                query_proj, offset=key_proj_size * i, size=key_proj_size)
-
-        with mixed_layer(size=key_proj_size) as sub_key_proj:
-            sub_key_proj += identity_projection(
-                key_proj, offset=key_proj_size * i, size=key_proj_size)
-
-        with mixed_layer(size=value_proj_size) as sub_value_proj:
-            sub_value_proj += identity_projection(
-                value_proj, offset=value_proj_size * i, size=value_proj_size)
-
-        if attention_type == 'dot-product attention':
-            m = dot_prod_layer(
-                input1=sub_query_proj,
-                input2=sub_key_proj,
-                name='%s_dot-product_%d' % (name, i))
-            m = slope_intercept_layer(
-                input=m,
-                slope=math.sqrt(1.0 / key_proj_size),
-                name='%s_dot-product_scaling_%d' % (name, i))
-        else:
-            with mixed_layer(
-                    size=key_proj_size,
-                    act=TanhActivation(),
-                    name='%s_combine_%d' % (name, i)) as m:
-                m += identity_projection(sub_query_proj)
-                m += identity_projection(sub_key_proj)
-
-        attention_weight = fc_layer(
-            input=m,
-            size=1,
-            act=SequenceSoftmaxActivation(),
-            param_attr=softmax_param_attr,
-            name="%s_softmax_%d" % (name, i),
-            bias_attr=False)
-
-        scaled = scaling_layer(
-            weight=attention_weight,
-            input=sub_value_proj,
-            name='%s_scaling_%d' % (name, i))
-        head = pooling_layer(
-            input=scaled,
-            pooling_type=SumPooling(),
-            name="%s_pooling_%d" % (name, i))
-
-        head_list.append(head)
-
-    attended = concat_layer(head_list)
-
-    return attended
-
-
-def inputs(layers, *args):
-    """
-    Declare the inputs of network. The order of input should be as same as
-    the data provider's return order.
-
-    :param layers: Input Layers.
-    :type layers: list|tuple|LayerOutput.
-    :return:
-    """
-
-    if isinstance(layers, LayerOutput) or isinstance(layers, basestring):
-        layers = [layers]
-    if len(args) != 0:
-        layers.extend(args)
-
-    Inputs(*[l.name for l in layers])
-
-
-def outputs(layers, *args):
-    """
-    Declare the outputs of network. If user has not defined the inputs of
-    network, this method will calculate the input order by dfs travel.
-
-    :param layers: Output layers.
-    :type layers: list|tuple|LayerOutput
-    :return:
-    """
-
-    traveled = set()
-
-    def __dfs_travel__(layer,
-                       predicate=lambda x: x.layer_type == LayerType.DATA):
-        """
-        DFS LRV Travel for output layer.
-
-        The return order is define order for data_layer in this leaf node.
-
-        :param layer:
-        :type layer: LayerOutput
-        :return:
-        """
-        if layer in traveled:
-            return []
-        else:
-            traveled.add(layer)
-
-        assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
-        retv = []
-        if layer.parents is not None:
-            for p in layer.parents:
-                retv.extend(__dfs_travel__(p, predicate))
-
-        if predicate(layer):
-            retv.append(layer)
-        return retv
-
-    if isinstance(layers, LayerOutput):
-        layers = [layers]
-
-    if len(args) != 0:
-        layers.extend(args)
-
-    assert len(layers) > 0
-
-    if HasInputsSet():  # input already set
-        Outputs(*[l.name for l in layers])
-        return  # just return outputs.
-
-    if len(layers) != 1:
-        logger.warning("`outputs` routine try to calculate network's"
-                       " inputs and outputs order. It might not work well."
-                       "Please see follow log carefully.")
-    inputs = []
-    outputs_ = []
-    for each_layer in layers:
-        assert isinstance(each_layer, LayerOutput)
-        inputs.extend(__dfs_travel__(each_layer))
-        outputs_.extend(
-            __dfs_travel__(each_layer,
-                           lambda x: x.layer_type == LayerType.COST))
-
-    # Currently, we got each leaf node's inputs order, output order.
-    # We merge them together.
-
-    final_inputs = []
-    final_outputs = []
-
-    for each_input in inputs:
-        assert isinstance(each_input, LayerOutput)
-        if each_input.name not in final_inputs:
-            final_inputs.append(each_input.name)
-
-    for each_output in outputs_:
-        assert isinstance(each_output, LayerOutput)
-        if each_output.name not in final_outputs:
-            final_outputs.append(each_output.name)
-
-    logger.info("".join(["The input order is [", ", ".join(final_inputs), "]"]))
-
-    if len(final_outputs) == 0:
-        final_outputs = map(lambda x: x.name, layers)
-
-    logger.info("".join(
-        ["The output order is [", ", ".join(final_outputs), "]"]))
-
-    Inputs(*final_inputs)
-    Outputs(*final_outputs)
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
deleted file mode 100644
index 32698e5b2..000000000
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ /dev/null
@@ -1,447 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import Settings, default_decay_rate, \
-    default_gradient_clipping_threshold, default_momentum
-
-from .default_decorators import wrap_param_default
-
-__all__ = [
-    'Optimizer', 'BaseSGDOptimizer', 'MomentumOptimizer', 'AdamaxOptimizer',
-    'AdamOptimizer', 'AdaGradOptimizer', 'RMSPropOptimizer',
-    'DecayedAdaGradOptimizer', 'AdaDeltaOptimizer', 'BaseRegularization',
-    'L2Regularization', 'settings', 'ModelAverage'
-]
-
-
-class Optimizer(object):
-    def to_setting_kwargs(self):
-        raise NotImplementedError()
-
-    def extra_settings(self):
-        pass
-
-    @property
-    def is_support_sparse(self):
-        return True
-
-
-class BaseSGDOptimizer(Optimizer):
-    """
-    SGD Optimizer.
-
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
-
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
-
-    ..  math::
-
-        Q(w) = \\sum_{i}^{n} Q_i(w)
-
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
-
-    So, the SGD method will optimize the weight by
-
-    ..  math::
-
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
-    """
-
-    def to_setting_kwargs(self):
-        raise NotImplementedError()
-
-
-class MomentumOptimizer(BaseSGDOptimizer):
-    """
-    MomentumOptimizer.
-
-    When sparse=True, the update scheme:
-
-    ..  math::
-
-        \\alpha_t &= \\alpha_{t-1} / k \\\\
-        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
-        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
-        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
-        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
-    
-    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
-    :math:`\\gamma_t` is learning rate at the t'th step.
-
-    :param sparse: with sparse support or not.
-    :type sparse: bool
-    """
-
-    def extra_settings(self):
-        default_momentum(self.momentum)
-
-    def to_setting_kwargs(self):
-        if self.sparse:
-            return {'learning_method': 'sparse_momentum'}
-        else:
-            return {'learning_method': 'momentum'}
-
-    def __init__(self, momentum=None, sparse=False):
-        self.momentum = momentum
-        self.sparse = sparse
-
-
-class AdamOptimizer(BaseSGDOptimizer):
-    """
-    Adam optimizer.
-    The details of please refer `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
-
-    :param beta1: the :math:`\\beta_1` in equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in equation.
-    :type beta2: float
-    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
-                        divided by zero.
-    :type epsilon: float
-    """
-
-    @property
-    def is_support_sparse(self):
-        return False
-
-    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8):
-        self.beta1 = beta1
-        self.beta2 = beta2
-        self.epsilon = epsilon
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adam',
-            'adam_beta1': self.beta1,
-            'adam_beta2': self.beta2,
-            'adam_epsilon': self.epsilon
-        }
-
-
-class AdamaxOptimizer(BaseSGDOptimizer):
-    """
-    Adamax optimizer.
-
-    The details of please refer this `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
-
-    :param beta1: the :math:`\\beta_1` in the equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in the equation.
-    :type beta2: float
-    """
-
-    def __init__(self, beta1, beta2):
-        self.beta1 = beta1
-        self.beta2 = beta2
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adamax',
-            'adam_beta1': self.beta1,
-            'adam_beta2': self.beta2
-        }
-
-    @property
-    def is_support_sparse(self):
-        return False
-
-
-class AdaGradOptimizer(BaseSGDOptimizer):
-    """
-    Adagrad(for ADAptive GRAdient algorithm) optimizer.
-
-    For details please refer this `Adaptive Subgradient Methods for
-    Online Learning and Stochastic Optimization
-    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
-
-    ..  math::
-
-        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
-    """
-
-    def to_setting_kwargs(self):
-        return {'learning_method': 'adagrad'}
-
-    def __init__(self):
-        pass
-
-
-class RMSPropOptimizer(BaseSGDOptimizer):
-    """
-    RMSProp(for Root Mean Square Propagation) optimizer. For details please
-    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
-    lecture_slides_lec6.pdf>`_.
-
-    The equations of this method as follows:
-
-    ..  math::
-
-        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
-    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
-    :type rho: float
-    :param epsilon: the :math:`\\epsilon` in the equation.
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'rmsprop',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class DecayedAdaGradOptimizer(BaseSGDOptimizer):
-    """
-    AdaGrad method with decayed sum gradients. The equations of this method
-    show as follow.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )
-
-    :param rho: The :math:`\\rho` parameter in that equation
-    :type rho: float
-    :param epsilon: The :math:`\\epsilon` parameter in that equation.
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'decayed_adagrad',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class AdaDeltaOptimizer(BaseSGDOptimizer):
-    """
-    AdaDelta method. The details of adadelta please refer to this
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
-                          E(g_t^2) + \\epsilon ) ) \\\\
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
-
-    :param rho: :math:`\\rho` in equation
-    :type rho: float
-    :param epsilon: :math:`\\rho` in equation
-    :type epsilon: float
-    """
-
-    def to_setting_kwargs(self):
-        return {
-            'learning_method': 'adadelta',
-            'ada_rou': self.rho,
-            'ada_epsilon': self.epsilon
-        }
-
-    def __init__(self, rho=0.95, epsilon=1e-6):
-        self.rho = rho
-        self.epsilon = epsilon
-
-
-class BaseRegularization(Optimizer):
-    def __init__(self):
-        self.algorithm = ""
-        self.learning_method = ""
-
-    def to_setting_kwargs(self):
-        return {}
-
-
-class L2Regularization(BaseRegularization):
-    def __init__(self, rate):
-        super(L2Regularization, self).__init__()
-        self.decay_rate = rate
-
-    def to_setting_kwargs(self):
-        if self.algorithm == 'owlqn':
-            return {'l2weight': self.decay_rate}
-        else:
-            return dict()
-
-    def extra_settings(self):
-        if self.algorithm == 'sgd' or self.algorithm == 'async_sgd':
-            default_decay_rate(self.decay_rate)
-
-
-class ModelAverage(Optimizer):
-    def to_setting_kwargs(self):
-        return {
-            'average_window': self.average_window,
-            'max_average_window': self.max_average_window,
-            'do_average_in_cpu': self.do_average_in_cpu
-        }
-
-    def __init__(self,
-                 average_window,
-                 max_average_window=None,
-                 do_average_in_cpu=False):
-        self.average_window = average_window
-        self.max_average_window = max_average_window
-        self.do_average_in_cpu = do_average_in_cpu
-
-
-class GradientClippingThreshold(Optimizer):
-    def extra_settings(self):
-        default_gradient_clipping_threshold(self.threshold)
-
-    def __init__(self, threshold):
-        self.threshold = threshold
-
-    def to_setting_kwargs(self):
-        return dict()
-
-
-def __extends__(dict1, dict2):
-    for key in dict2:
-        assert key not in dict1
-        dict1[key] = dict2[key]
-    return dict1
-
-
-@wrap_param_default(
-    ['learning_method'], default_factory=lambda _: MomentumOptimizer())
-@wrap_param_default(
-    ['regularization'], default_factory=lambda _: BaseRegularization())
-def settings(batch_size,
-             learning_rate=1e-3,
-             learning_rate_decay_a=0.,
-             learning_rate_decay_b=0.,
-             learning_rate_schedule='poly',
-             learning_rate_args='',
-             async_lagged_grad_discard_ratio=1.5,
-             learning_method=None,
-             regularization=None,
-             is_async=False,
-             model_average=None,
-             gradient_clipping_threshold=None):
-    """
-    Set the optimization method, learning rate, batch size, and other training
-    settings. The currently supported algorithms are SGD and Async-SGD.
-
-    ..  warning::
-
-        Note that the 'batch_size' in PaddlePaddle is not equal to global
-        training batch size. It represents the single training process's batch
-        size. If you use N processes to train one model, for example use three
-        GPU machines, the global batch size is N*'batch_size'.
-
-    :param batch_size: batch size for one training process.
-    :type batch_size: int
-    :param learning_rate: learning rate for SGD
-    :type learning_rate: float
-    :param learning_method: The extension optimization algorithms of gradient
-                            descent, such as momentum, adagrad, rmsprop, etc.
-                            Note that it should be instance with base type
-                            BaseSGDOptimizer.
-    :type learning_method: BaseSGDOptimizer
-    :param regularization: The regularization method.
-    :type regularization: BaseRegularization
-    :param is_async: Is Async-SGD or not. Default value is False.
-    :type is_async: bool
-    :param model_average: Model Average Settings.
-    :type model_average: ModelAverage
-    :param gradient_clipping_threshold: gradient clipping threshold. If gradient
-                                        value larger than some value, will be
-                                        clipped.
-    :type gradient_clipping_threshold: float
-    :param async_lagged_grad_discard_ratio: async SGD gradient commit control,
-          when async_lagged_grad_discard_ratio * num_gradient_servers commit passed, 
-          the current async SGD gradient is discarded.
-    :type async_lagged_grad_discard_ratio: float
-    """
-    if isinstance(regularization, BaseRegularization):
-        regularization = [regularization]
-
-    assert isinstance(learning_method, Optimizer)
-    if isinstance(learning_method, BaseSGDOptimizer):
-        algorithm = 'async_sgd' if is_async else 'sgd'
-    else:
-        algorithm = 'owlqn'
-
-    args = [
-        'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'gradient_clipping_threshold', 'async_lagged_grad_discard_ratio'
-    ]
-    kwargs = dict()
-    kwargs['algorithm'] = algorithm
-    for arg in args:
-        kwargs[arg] = locals()[arg]
-
-    kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
-    learning_method.extra_settings()
-
-    for regular in regularization:
-        assert isinstance(regular, BaseRegularization)
-        regular.algorithm = algorithm
-        regular.learning_method = kwargs['learning_method']
-        kwargs = __extends__(kwargs, regular.to_setting_kwargs())
-        regular.extra_settings()
-
-    if gradient_clipping_threshold is not None:
-        gradient_clipping_threshold = GradientClippingThreshold(
-            threshold=gradient_clipping_threshold)
-
-    for each in [model_average, gradient_clipping_threshold]:
-        if each is not None:
-            assert isinstance(each, Optimizer)
-            each.algorithm = algorithm
-            each.learning_method = kwargs['learning_method']
-            kwargs = __extends__(kwargs, each.to_setting_kwargs())
-            each.extra_settings()
-
-    # Do Check?
-    Settings(**kwargs)
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
deleted file mode 100644
index e0aeb311b..000000000
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-"""
-
-__all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
-    "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling",
-    "SumPooling", "SquareRootNPooling"
-]
-
-
-class BasePoolingType(object):
-    """
-    Base Pooling Type.
-    Note these pooling types are used for sequence input, not for images.
-    Each PoolingType contains one parameter:
-
-    :param name: pooling layer type name used by paddle.
-    :type name: basestring
-
-    """
-
-    def __init__(self, name):
-        self.name = name
-
-
-class MaxPooling(BasePoolingType):
-    """
-    Max pooling.
-
-    Return the very large values for each dimension in sequence or time steps.
-
-    ..  math::
-
-        max(samples\\_of\\_a\\_sequence)
-
-    :param output_max_index: True if output sequence max index instead of max
-                             value. None means use default value in proto.
-    :type output_max_index: bool|None
-    """
-
-    def __init__(self, output_max_index=None):
-        BasePoolingType.__init__(self, "max")
-        self.output_max_index = output_max_index
-
-
-class MaxWithMaskPooling(BasePoolingType):
-    """
-    MaxWithMask pooling.
-
-    Not only return the very large values for each dimension in sequence or time steps,
-    but also the location indices of found maxinum values.
-
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "max-pool-with-mask")
-
-
-class CudnnMaxPooling(BasePoolingType):
-    """
-    Cudnn max pooling only support GPU. Return the maxinum value in the
-    pooling window.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-max-pool")
-
-
-class CudnnAvgPooling(BasePoolingType):
-    """
-    Cudnn average pooling only support GPU. Return the average value in the
-    pooling window.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-avg-pool")
-
-
-class CudnnAvgInclPadPooling(BasePoolingType):
-    """
-    Cudnn average pooling only support GPU. Return the average value in the
-    pooling window taking into account the padding cells.
-    """
-
-    def __init__(self):
-        BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool")
-
-
-class AvgPooling(BasePoolingType):
-    """
-    Average pooling.
-
-    Return the average values for each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)/sample\\_num
-    """
-    STRATEGY_AVG = "average"
-    STRATEGY_SUM = "sum"
-    STRATEGY_SQROOTN = "squarerootn"
-
-    def __init__(self, strategy=STRATEGY_AVG):
-        BasePoolingType.__init__(self, "average")
-        self.strategy = strategy
-
-
-class SumPooling(AvgPooling):
-    """
-    Sum pooling.
-
-    Return the sum values of each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)
-    """
-
-    def __init__(self):
-        AvgPooling.__init__(self, AvgPooling.STRATEGY_SUM)
-
-
-class SquareRootNPooling(AvgPooling):
-    """
-    Square Root Pooling.
-
-    Return the square root values of each dimension in sequence or time steps.
-
-    ..  math::
-
-        sum(samples\\_of\\_a\\_sequence)/sqrt(sample\\_num)
-    """
-
-    def __init__(self):
-        AvgPooling.__init__(self, AvgPooling.STRATEGY_SQROOTN)
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
deleted file mode 100644
index 30e0b9906..000000000
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-#################### test_config_parser #########################
-add_test(NAME layers_test
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
-
-add_test(NAME test_reset_hook
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
-
-add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
-add_test(NAME test_layerHelpers
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
-  ${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-)
diff --git a/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp b/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
deleted file mode 100644
index 7b10e0b7a..000000000
--- a/python/paddle/trainer_config_helpers/tests/ProtobufEqualMain.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <google/protobuf/text_format.h>
-#include <google/protobuf/util/message_differencer.h>
-#include <fstream>
-#include <iostream>
-#include "TrainerConfig.pb.h"
-
-bool loadPb(google::protobuf::Message* conf, const std::string& filename) {
-  std::ifstream fin;
-  fin.open(filename.c_str());
-  if (fin.is_open()) {
-    std::string str((std::istreambuf_iterator<char>(fin)),
-                    std::istreambuf_iterator<char>());
-    bool ok = google::protobuf::TextFormat::ParseFromString(str, conf);
-    fin.close();
-    return ok;
-  } else {
-    return false;
-  }
-}
-
-int main(int argc, char** argv) {
-  std::unique_ptr<google::protobuf::Message> config1;
-  std::unique_ptr<google::protobuf::Message> config2;
-  if (argc == 3) {
-    config1.reset(new paddle::ModelConfig());
-    config2.reset(new paddle::ModelConfig());
-  } else if (argc == 4) {
-    config1.reset(new paddle::TrainerConfig());
-    config2.reset(new paddle::TrainerConfig());
-  }
-  if (!config1 || !config2) {
-    return 1;
-  } else if (!loadPb(config1.get(), argv[1])) {
-    return 2;
-  } else if (!loadPb(config2.get(), argv[2])) {
-    return 3;
-  } else {
-    if (google::protobuf::util::MessageDifferencer::ApproximatelyEquals(
-            *config1, *config2)) {
-      return 0;
-    } else {
-      return 4;
-    }
-  }
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/.gitignore b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
deleted file mode 100644
index c654bd41b..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-protostr/*.unittest
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
deleted file mode 100755
index 10c941f70..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-export configs=(test_repeat_layer test_fc layer_activations projections test_print_layer
-test_sequence_pooling test_lstmemory_layer test_grumemory_layer
-last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
-img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
-test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
-test_factorization_machine)
-
-export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
deleted file mode 100755
index 44a75a60c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-set -e
-cd `dirname $0`
-
-protostr=$PWD/protostr
-. file_list.sh
-
-for conf in ${configs[*]}
-do
-    echo "Generating " $conf
-    $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    if [ ! -f "$protostr/$conf.protostr" ]; then 
-        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
-    fi
-    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
-done
-
-for conf in ${whole_configs[*]}
-do
-    echo "Generating " $conf
-    $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    if [ ! -f "$protostr/$conf.protostr" ]; then 
-        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
-    fi
-    cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
-done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
deleted file mode 100644
index 767b64542..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-3, batch_size=1000)
-
-img = data_layer(name='image', size=256 * 256)
-
-# the parse_conv in config_parse.py is not strictly accurate when filter_size
-# is not square. So here set square filter_size.
-img_conv = img_conv_layer(
-    input=img,
-    num_channels=1,
-    num_filters=64,
-    filter_size=(32, 32),
-    padding=(1, 1),
-    dilation=(1, 1),
-    stride=(1, 1),
-    act=LinearActivation())
-img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
-
-img_norm = img_cmrnorm_layer(input=img_bn, size=32)
-
-img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling())
-
-outputs(img_pool, img_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
deleted file mode 100644
index e17c8fa7c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-3, batch_size=1000)
-
-img = data_layer(name='image', size=227 * 227)
-
-# the parse_conv in config_parse.py is not strictly accurate when filter_size
-# is not square. So here set square filter_size.
-img_conv = img_conv_layer(
-    input=img,
-    num_channels=1,
-    num_filters=64,
-    filter_size=(32, 32),
-    padding=(1, 1),
-    stride=(1, 1),
-    act=LinearActivation(),
-    trans=True)
-img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
-
-img_norm = img_cmrnorm_layer(input=img_bn, size=32)
-
-img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling())
-
-outputs(img_pool, img_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
deleted file mode 100644
index 5b6d2627e..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-
-seq_op = [first_seq, last_seq]
-
-agg_level = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
-
-opts = []
-
-for op in seq_op:
-    for al in agg_level:
-        opts.append(op(input=din, agg_level=al))
-
-for op in seq_op:
-    opts.append(
-        op(input=din, agg_level=AggregateLevel.TO_NO_SEQUENCE, stride=5))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
deleted file mode 100644
index ac1f7e02c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Test all activations.
-'''
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='input', size=100)
-
-acts = [
-    TanhActivation, SigmoidActivation, SoftmaxActivation, IdentityActivation,
-    LinearActivation, ExpActivation, ReluActivation, BReluActivation,
-    SoftReluActivation, STanhActivation, AbsActivation, SquareActivation
-]
-
-outputs([
-    fc_layer(
-        input=din, size=100, act=act(), name="layer_%d" % i)
-    for i, act in enumerate(acts)
-])
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
deleted file mode 100644
index 29dc634fb..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-x = data_layer(name='data', size=100)
-x = layer_math.exp(x)
-x = layer_math.sqrt(x)
-x = layer_math.reciprocal(x)
-x = layer_math.log(x)
-x = layer_math.abs(x)
-x = layer_math.sigmoid(x)
-x = layer_math.tanh(x)
-x = layer_math.square(x)
-x = layer_math.relu(x)
-y = 1 + x
-y = y + 1
-y = x + y
-y = y - x
-y = y - 2
-y = 2 - y
-y = 2 * y
-y = y * 3
-z = data_layer(name='data_2', size=1)
-y = y * z
-y = z * y
-y = y + z
-y = z + y
-outputs(y)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py
deleted file mode 100644
index 3b7a196d1..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-'''
-Test mixed layer, projections and operators.
-'''
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='test', size=100)
-
-din = embedding_layer(input=din, size=256)
-
-with mixed_layer(size=100) as m1:
-    m1 += full_matrix_projection(input=din)
-
-with mixed_layer(size=100) as m2:
-    m2 += table_projection(input=m1)
-
-with mixed_layer(size=100) as m3:
-    m3 += identity_projection(input=m2)
-
-with mixed_layer(size=100) as m4:
-    m4 += dotmul_projection(input=m3)
-
-with mixed_layer() as m5:
-    m5 += context_projection(input=m4, context_len=3)
-
-with mixed_layer() as m6:
-    m6 += dotmul_operator(a=m3, b=m4)
-    m6 += scaling_projection(m3)
-
-img = data_layer(name='img', size=32 * 32)
-flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
-
-with mixed_layer() as m7:
-    m7 += conv_operator(
-        img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3)
-    m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1)
-
-with mixed_layer() as m8:
-    m8 += conv_operator(
-        img=img,
-        filter=flt,
-        num_filters=64,
-        num_channels=1,
-        filter_size=3,
-        stride=2,
-        padding=1,
-        trans=True)
-    m8 += conv_projection(
-        img,
-        filter_size=3,
-        num_filters=64,
-        num_channels=1,
-        stride=2,
-        padding=1,
-        trans=True)
-end = mixed_layer(
-    input=[
-        full_matrix_projection(input=m5),
-        trans_full_matrix_projection(input=m6),
-        full_matrix_projection(input=m7), full_matrix_projection(input=m8)
-    ],
-    size=100,
-    layer_attr=ExtraAttr(
-        drop_rate=0.5, error_clipping_threshold=40))
-
-outputs(end)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
deleted file mode 100644
index 3e0f95764..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ /dev/null
@@ -1,193 +0,0 @@
-type: "nn"
-layers {
-  name: "image"
-  type: "data"
-  size: 65536
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 3297856
-  active_type: ""
-  inputs {
-    input_layer_name: "image"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 32
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 227
-      img_size: 256
-      caffe_mode: true
-      filter_size_y: 32
-      padding_y: 1
-      stride_y: 1
-      output_y: 227
-      img_size_y: 256
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 64
-  shared_biases: true
-  height: 227
-  width: 227
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 3297856
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 64
-      img_size: 227
-      img_size_y: 227
-    }
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 227
-  width: 227
-  depth: 1
-  epsilon: 1e-05
-}
-layers {
-  name: "__crmnorm_0__"
-  type: "norm"
-  size: 3297856
-  active_type: ""
-  inputs {
-    input_layer_name: "__batch_norm_0__"
-    norm_conf {
-      norm_type: "cmrnorm-projection"
-      channels: 64
-      size: 32
-      scale: 0.0004
-      pow: 0.75
-      output_x: 227
-      img_size: 227
-      blocked: false
-      output_y: 227
-      img_size_y: 227
-    }
-  }
-  height: 227
-  width: 227
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 2458624
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 64
-      size_x: 32
-      stride: 1
-      output_x: 196
-      img_size: 227
-      padding: 0
-      size_y: 32
-      stride_y: 1
-      output_y: 196
-      img_size_y: 227
-      padding_y: 0
-    }
-  }
-  height: 196
-  width: 196
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 65536
-  initial_mean: 0.0
-  initial_std: 0.0441941738242
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 64
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 64
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "image"
-output_layer_names: "__pool_0__"
-output_layer_names: "__crmnorm_0__"
-sub_models {
-  name: "root"
-  layer_names: "image"
-  layer_names: "__conv_0__"
-  layer_names: "__batch_norm_0__"
-  layer_names: "__crmnorm_0__"
-  layer_names: "__pool_0__"
-  input_layer_names: "image"
-  output_layer_names: "__pool_0__"
-  output_layer_names: "__crmnorm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
deleted file mode 100644
index a18a4652e..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ /dev/null
@@ -1,193 +0,0 @@
-type: "nn"
-layers {
-  name: "image"
-  type: "data"
-  size: 51529
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconvt"
-  size: 4194304
-  active_type: ""
-  inputs {
-    input_layer_name: "image"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 32
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 64
-      output_x: 227
-      img_size: 256
-      caffe_mode: true
-      filter_size_y: 32
-      padding_y: 1
-      stride_y: 1
-      output_y: 227
-      img_size_y: 256
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 64
-  shared_biases: true
-  height: 256
-  width: 256
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 4194304
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 64
-      img_size: 256
-      img_size_y: 256
-    }
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "__conv_0__"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 256
-  width: 256
-  depth: 1
-  epsilon: 1e-05
-}
-layers {
-  name: "__crmnorm_0__"
-  type: "norm"
-  size: 4194304
-  active_type: ""
-  inputs {
-    input_layer_name: "__batch_norm_0__"
-    norm_conf {
-      norm_type: "cmrnorm-projection"
-      channels: 64
-      size: 32
-      scale: 0.0004
-      pow: 0.75
-      output_x: 256
-      img_size: 256
-      blocked: false
-      output_y: 256
-      img_size_y: 256
-    }
-  }
-  height: 256
-  width: 256
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 3240000
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 64
-      size_x: 32
-      stride: 1
-      output_x: 225
-      img_size: 256
-      padding: 0
-      size_y: 32
-      stride_y: 1
-      output_y: 225
-      img_size_y: 256
-      padding_y: 0
-    }
-  }
-  height: 225
-  width: 225
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 65536
-  initial_mean: 0.0
-  initial_std: 0.0441941738242
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 64
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 64
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 64
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 64
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "image"
-output_layer_names: "__pool_0__"
-output_layer_names: "__crmnorm_0__"
-sub_models {
-  name: "root"
-  layer_names: "image"
-  layer_names: "__conv_0__"
-  layer_names: "__batch_norm_0__"
-  layer_names: "__crmnorm_0__"
-  layer_names: "__pool_0__"
-  input_layer_names: "image"
-  output_layer_names: "__pool_0__"
-  output_layer_names: "__crmnorm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
deleted file mode 100644
index fee0f8e46..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ /dev/null
@@ -1,102 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_1__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_2__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-input_layer_names: "data"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__first_seq_1__"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__first_seq_2__"
-output_layer_names: "__last_seq_2__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__first_seq_0__"
-  layer_names: "__first_seq_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__first_seq_2__"
-  layer_names: "__last_seq_2__"
-  input_layer_names: "data"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__first_seq_1__"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__first_seq_2__"
-  output_layer_names: "__last_seq_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
deleted file mode 100644
index ecf39e4d3..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr
+++ /dev/null
@@ -1,423 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "layer_0"
-  type: "fc"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_0.w0"
-  }
-  bias_parameter_name: "_layer_0.wbias"
-}
-layers {
-  name: "layer_1"
-  type: "fc"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_1.w0"
-  }
-  bias_parameter_name: "_layer_1.wbias"
-}
-layers {
-  name: "layer_2"
-  type: "fc"
-  size: 100
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_2.w0"
-  }
-  bias_parameter_name: "_layer_2.wbias"
-}
-layers {
-  name: "layer_3"
-  type: "fc"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_3.w0"
-  }
-  bias_parameter_name: "_layer_3.wbias"
-}
-layers {
-  name: "layer_4"
-  type: "fc"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_4.w0"
-  }
-  bias_parameter_name: "_layer_4.wbias"
-}
-layers {
-  name: "layer_5"
-  type: "fc"
-  size: 100
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_5.w0"
-  }
-  bias_parameter_name: "_layer_5.wbias"
-}
-layers {
-  name: "layer_6"
-  type: "fc"
-  size: 100
-  active_type: "relu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_6.w0"
-  }
-  bias_parameter_name: "_layer_6.wbias"
-}
-layers {
-  name: "layer_7"
-  type: "fc"
-  size: 100
-  active_type: "brelu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_7.w0"
-  }
-  bias_parameter_name: "_layer_7.wbias"
-}
-layers {
-  name: "layer_8"
-  type: "fc"
-  size: 100
-  active_type: "softrelu"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_8.w0"
-  }
-  bias_parameter_name: "_layer_8.wbias"
-}
-layers {
-  name: "layer_9"
-  type: "fc"
-  size: 100
-  active_type: "stanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_9.w0"
-  }
-  bias_parameter_name: "_layer_9.wbias"
-}
-layers {
-  name: "layer_10"
-  type: "fc"
-  size: 100
-  active_type: "abs"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_10.w0"
-  }
-  bias_parameter_name: "_layer_10.wbias"
-}
-layers {
-  name: "layer_11"
-  type: "fc"
-  size: 100
-  active_type: "square"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "_layer_11.w0"
-  }
-  bias_parameter_name: "_layer_11.wbias"
-}
-parameters {
-  name: "_layer_0.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_0.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_1.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_1.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_2.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_2.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_3.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_3.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_4.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_4.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_5.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_5.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_6.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_6.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_7.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_7.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_8.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_8.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_9.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_9.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_10.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_10.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_layer_11.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_layer_11.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "layer_0"
-output_layer_names: "layer_1"
-output_layer_names: "layer_2"
-output_layer_names: "layer_3"
-output_layer_names: "layer_4"
-output_layer_names: "layer_5"
-output_layer_names: "layer_6"
-output_layer_names: "layer_7"
-output_layer_names: "layer_8"
-output_layer_names: "layer_9"
-output_layer_names: "layer_10"
-output_layer_names: "layer_11"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "layer_0"
-  layer_names: "layer_1"
-  layer_names: "layer_2"
-  layer_names: "layer_3"
-  layer_names: "layer_4"
-  layer_names: "layer_5"
-  layer_names: "layer_6"
-  layer_names: "layer_7"
-  layer_names: "layer_8"
-  layer_names: "layer_9"
-  layer_names: "layer_10"
-  layer_names: "layer_11"
-  input_layer_names: "input"
-  output_layer_names: "layer_0"
-  output_layer_names: "layer_1"
-  output_layer_names: "layer_2"
-  output_layer_names: "layer_3"
-  output_layer_names: "layer_4"
-  output_layer_names: "layer_5"
-  output_layer_names: "layer_6"
-  output_layer_names: "layer_7"
-  output_layer_names: "layer_8"
-  output_layer_names: "layer_9"
-  output_layer_names: "layer_10"
-  output_layer_names: "layer_11"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
deleted file mode 100644
index 582207741..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ /dev/null
@@ -1,413 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__exp_0__"
-  type: "mixed"
-  size: 100
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "data"
-    proj_conf {
-      type: "identity"
-      name: "___exp_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__sqrt_0__"
-  type: "mixed"
-  size: 100
-  active_type: "sqrt"
-  inputs {
-    input_layer_name: "__exp_0__"
-    proj_conf {
-      type: "identity"
-      name: "___sqrt_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__reciprocal_0__"
-  type: "mixed"
-  size: 100
-  active_type: "reciprocal"
-  inputs {
-    input_layer_name: "__sqrt_0__"
-    proj_conf {
-      type: "identity"
-      name: "___reciprocal_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__log_0__"
-  type: "mixed"
-  size: 100
-  active_type: "log"
-  inputs {
-    input_layer_name: "__reciprocal_0__"
-    proj_conf {
-      type: "identity"
-      name: "___log_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__abs_0__"
-  type: "mixed"
-  size: 100
-  active_type: "abs"
-  inputs {
-    input_layer_name: "__log_0__"
-    proj_conf {
-      type: "identity"
-      name: "___abs_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__sigmoid_0__"
-  type: "mixed"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__abs_0__"
-    proj_conf {
-      type: "identity"
-      name: "___sigmoid_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__tanh_0__"
-  type: "mixed"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__sigmoid_0__"
-    proj_conf {
-      type: "identity"
-      name: "___tanh_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__square_0__"
-  type: "mixed"
-  size: 100
-  active_type: "square"
-  inputs {
-    input_layer_name: "__tanh_0__"
-    proj_conf {
-      type: "identity"
-      name: "___square_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__relu_0__"
-  type: "mixed"
-  size: 100
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__square_0__"
-    proj_conf {
-      type: "identity"
-      name: "___relu_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_0__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-  }
-  slope: 1.0
-  intercept: 1
-}
-layers {
-  name: "__slope_intercept_layer_1__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_0__"
-  }
-  slope: 1.0
-  intercept: 1
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_0__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_2__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__relu_0__"
-  }
-  slope: -1.0
-  intercept: 0.0
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_2__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_1__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__slope_intercept_layer_3__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__"
-  }
-  slope: 1.0
-  intercept: -2
-}
-layers {
-  name: "__slope_intercept_layer_4__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_3__"
-  }
-  slope: -1.0
-  intercept: 0.0
-}
-layers {
-  name: "__slope_intercept_layer_5__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_4__"
-  }
-  slope: 1.0
-  intercept: 2
-}
-layers {
-  name: "__slope_intercept_layer_6__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_5__"
-  }
-  slope: 2
-  intercept: 0.0
-}
-layers {
-  name: "__slope_intercept_layer_7__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__slope_intercept_layer_6__"
-  }
-  slope: 3
-  intercept: 0.0
-}
-layers {
-  name: "data_2"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__scaling_layer_0__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  inputs {
-    input_layer_name: "__slope_intercept_layer_7__"
-  }
-}
-layers {
-  name: "__scaling_layer_1__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  inputs {
-    input_layer_name: "__scaling_layer_0__"
-  }
-}
-layers {
-  name: "__repeat_layer_0__"
-  type: "featmap_expand"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  num_filters: 100
-}
-layers {
-  name: "__mixed_2__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__scaling_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__repeat_layer_0__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__repeat_layer_1__"
-  type: "featmap_expand"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2"
-  }
-  num_filters: 100
-}
-layers {
-  name: "__mixed_3__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_3__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__repeat_layer_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_3__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-input_layer_names: "data_2"
-input_layer_names: "data"
-output_layer_names: "__mixed_3__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__exp_0__"
-  layer_names: "__sqrt_0__"
-  layer_names: "__reciprocal_0__"
-  layer_names: "__log_0__"
-  layer_names: "__abs_0__"
-  layer_names: "__sigmoid_0__"
-  layer_names: "__tanh_0__"
-  layer_names: "__square_0__"
-  layer_names: "__relu_0__"
-  layer_names: "__slope_intercept_layer_0__"
-  layer_names: "__slope_intercept_layer_1__"
-  layer_names: "__mixed_0__"
-  layer_names: "__slope_intercept_layer_2__"
-  layer_names: "__mixed_1__"
-  layer_names: "__slope_intercept_layer_3__"
-  layer_names: "__slope_intercept_layer_4__"
-  layer_names: "__slope_intercept_layer_5__"
-  layer_names: "__slope_intercept_layer_6__"
-  layer_names: "__slope_intercept_layer_7__"
-  layer_names: "data_2"
-  layer_names: "__scaling_layer_0__"
-  layer_names: "__scaling_layer_1__"
-  layer_names: "__repeat_layer_0__"
-  layer_names: "__mixed_2__"
-  layer_names: "__repeat_layer_1__"
-  layer_names: "__mixed_3__"
-  input_layer_names: "data_2"
-  input_layer_names: "data"
-  output_layer_names: "__mixed_3__"
-  is_recurrent_layer_group: false
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
deleted file mode 100644
index d8bd7b9df..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ /dev/null
@@ -1,466 +0,0 @@
-type: "nn"
-layers {
-  name: "test"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__embedding_0__"
-  type: "mixed"
-  size: 256
-  active_type: ""
-  inputs {
-    input_layer_name: "test"
-    input_parameter_name: "___embedding_0__.w0"
-    proj_conf {
-      type: "table"
-      name: "___embedding_0__.w0"
-      input_size: 100
-      output_size: 256
-    }
-  }
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__embedding_0__"
-    input_parameter_name: "___mixed_0__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 256
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__"
-    input_parameter_name: "___mixed_1__.w0"
-    proj_conf {
-      type: "table"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_2__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__"
-    proj_conf {
-      type: "identity"
-      name: "___mixed_2__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_3__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-    input_parameter_name: "___mixed_3__.w0"
-    proj_conf {
-      type: "dot_mul"
-      name: "___mixed_3__.w0"
-      input_size: 100
-      output_size: 100
-    }
-  }
-}
-layers {
-  name: "__mixed_4__"
-  type: "mixed"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_3__"
-    input_parameter_name: "___mixed_4__.w0"
-    proj_conf {
-      type: "context"
-      name: "___mixed_4__.w0"
-      input_size: 100
-      output_size: 300
-      context_start: -1
-      context_length: 3
-      trainable_padding: true
-    }
-  }
-}
-layers {
-  name: "__mixed_5__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_2__"
-  }
-  inputs {
-    input_layer_name: "__mixed_2__"
-    input_parameter_name: "___mixed_5__.w1"
-    proj_conf {
-      type: "scaling"
-      name: "___mixed_5__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_3__"
-  }
-  operator_confs {
-    type: "dot_mul"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 100
-    input_sizes: 100
-    output_size: 100
-    dotmul_scale: 1
-  }
-}
-layers {
-  name: "img"
-  type: "data"
-  size: 1024
-  active_type: ""
-}
-layers {
-  name: "filter"
-  type: "data"
-  size: 576
-  active_type: ""
-}
-layers {
-  name: "__mixed_6__"
-  type: "mixed"
-  size: 57600
-  active_type: ""
-  inputs {
-    input_layer_name: "img"
-  }
-  inputs {
-    input_layer_name: "img"
-    input_parameter_name: "___mixed_6__.w1"
-    proj_conf {
-      type: "conv"
-      name: "___mixed_6__.w1"
-      input_size: 1024
-      output_size: 57600
-      conv_conf {
-        filter_size: 3
-        channels: 1
-        stride: 1
-        padding: 0
-        groups: 1
-        filter_channels: 1
-        output_x: 30
-        img_size: 32
-        caffe_mode: true
-        filter_size_y: 3
-        padding_y: 0
-        stride_y: 1
-        output_y: 30
-        img_size_y: 32
-      }
-      num_filters: 64
-    }
-  }
-  inputs {
-    input_layer_name: "filter"
-  }
-  operator_confs {
-    type: "conv"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 1024
-    input_sizes: 576
-    output_size: 57600
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 0
-      groups: 1
-      filter_channels: 1
-      output_x: 30
-      img_size: 32
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 0
-      stride_y: 1
-      output_y: 30
-      img_size_y: 32
-    }
-    num_filters: 64
-  }
-}
-layers {
-  name: "__mixed_7__"
-  type: "mixed"
-  size: 254016
-  active_type: ""
-  inputs {
-    input_layer_name: "img"
-  }
-  inputs {
-    input_layer_name: "img"
-    input_parameter_name: "___mixed_7__.w1"
-    proj_conf {
-      type: "convt"
-      name: "___mixed_7__.w1"
-      input_size: 1024
-      output_size: 254016
-      conv_conf {
-        filter_size: 3
-        channels: 1
-        stride: 2
-        padding: 1
-        groups: 1
-        filter_channels: 64
-        output_x: 32
-        img_size: 63
-        caffe_mode: true
-        filter_size_y: 3
-        padding_y: 1
-        stride_y: 2
-        output_y: 32
-        img_size_y: 63
-      }
-      num_filters: 64
-    }
-  }
-  inputs {
-    input_layer_name: "filter"
-  }
-  operator_confs {
-    type: "convt"
-    input_indices: 0
-    input_indices: 2
-    input_sizes: 1024
-    input_sizes: 576
-    output_size: 254016
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 64
-      output_x: 32
-      img_size: 63
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 32
-      img_size_y: 63
-    }
-    num_filters: 64
-  }
-}
-layers {
-  name: "__mixed_8__"
-  type: "mixed"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_4__"
-    input_parameter_name: "___mixed_8__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w0"
-      input_size: 300
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_5__"
-    input_parameter_name: "___mixed_8__.w1"
-    proj_conf {
-      type: "trans_fc"
-      name: "___mixed_8__.w1"
-      input_size: 100
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_6__"
-    input_parameter_name: "___mixed_8__.w2"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w2"
-      input_size: 57600
-      output_size: 100
-    }
-  }
-  inputs {
-    input_layer_name: "__mixed_7__"
-    input_parameter_name: "___mixed_8__.w3"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_8__.w3"
-      input_size: 254016
-      output_size: 100
-    }
-  }
-  drop_rate: 0.5
-  error_clipping_threshold: 40.0
-}
-parameters {
-  name: "___embedding_0__.w0"
-  size: 25600
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 256
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_0__.w0"
-  size: 25600
-  initial_mean: 0.0
-  initial_std: 0.0625
-  dims: 256
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_1__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_3__.w0"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_4__.w0"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 2
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_5__.w1"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_6__.w1"
-  size: 576
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_7__.w1"
-  size: 576
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___mixed_8__.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.057735026919
-  dims: 300
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w1"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w2"
-  size: 5760000
-  initial_mean: 0.0
-  initial_std: 0.00416666666667
-  dims: 57600
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_8__.w3"
-  size: 25401600
-  initial_mean: 0.0
-  initial_std: 0.00198412698413
-  dims: 254016
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "test"
-input_layer_names: "img"
-input_layer_names: "filter"
-output_layer_names: "__mixed_8__"
-sub_models {
-  name: "root"
-  layer_names: "test"
-  layer_names: "__embedding_0__"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__mixed_2__"
-  layer_names: "__mixed_3__"
-  layer_names: "__mixed_4__"
-  layer_names: "__mixed_5__"
-  layer_names: "img"
-  layer_names: "filter"
-  layer_names: "__mixed_6__"
-  layer_names: "__mixed_7__"
-  layer_names: "__mixed_8__"
-  input_layer_names: "test"
-  input_layer_names: "img"
-  input_layer_names: "filter"
-  output_layer_names: "__mixed_8__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
deleted file mode 100644
index 3e8633b07..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr
+++ /dev/null
@@ -1,125 +0,0 @@
-type: "nn"
-layers {
-  name: "feature_a"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "feature_b"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "feature_a"
-    input_parameter_name: "fc_param"
-  }
-  bias_parameter_name: "bias_param"
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "feature_b"
-    input_parameter_name: "fc_param"
-  }
-  bias_parameter_name: "bias_param"
-}
-layers {
-  name: "__fc_layer_2__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_2__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "fc_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 200
-  dims: 200
-  initial_strategy: 1
-  initial_smart: false
-}
-parameters {
-  name: "bias_param"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 2000
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 200
-  dims: 10
-  initial_strategy: 1
-  initial_smart: false
-}
-input_layer_names: "feature_a"
-input_layer_names: "feature_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_2__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "feature_a"
-  layer_names: "feature_b"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__fc_layer_2__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "feature_a"
-  input_layer_names: "feature_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
deleted file mode 100644
index 7254deb36..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
+++ /dev/null
@@ -1,289 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "data_a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "data_b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0___transform"
-  type: "mixed"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "data_a"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___simple_gru_0___transform.w0"
-      input_size: 100
-      output_size: 600
-    }
-  }
-}
-layers {
-  name: "__simple_gru_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  type: "scatter_agent"
-  size: 600
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-  type: "gru_step"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-    input_parameter_name: "gru_param"
-  }
-  inputs {
-    input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  }
-  bias_parameter_name: "gru_bias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__simple_gru_0__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1___transform"
-  type: "mixed"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "data_b"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___simple_gru_1___transform.w0"
-      input_size: 100
-      output_size: 600
-    }
-  }
-}
-layers {
-  name: "__simple_gru_1___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  type: "scatter_agent"
-  size: 600
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-  type: "gru_step"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-    input_parameter_name: "gru_param"
-  }
-  inputs {
-    input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  }
-  bias_parameter_name: "gru_bias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__simple_gru_1__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__simple_gru_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__simple_gru_1__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__last_seq_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__last_seq_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "mixed_param"
-  size: 60000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "gru_param"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "gru_bias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 2000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data_a"
-input_layer_names: "data_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "data_a"
-  layer_names: "data_b"
-  layer_names: "__simple_gru_0___transform"
-  layer_names: "__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__"
-  layer_names: "__simple_gru_1___transform"
-  layer_names: "__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "data_a"
-  input_layer_names: "data_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__simple_gru_0___transform"
-    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
-    link_name: "__simple_gru_0__"
-  }
-}
-sub_models {
-  name: "__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-  }
-  in_links {
-    layer_name: "__simple_gru_1___transform"
-    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-  }
-  out_links {
-    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
-    link_name: "__simple_gru_1__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
deleted file mode 100644
index 75cf23120..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ /dev/null
@@ -1,385 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "data_a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "data_b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "data_a"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "data_b"
-    input_parameter_name: "mixed_param"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_0___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    input_parameter_name: "lstm_param"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_0___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  bias_parameter_name: "lstm_bias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_1__@__lstm_group_1___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_1___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-    input_parameter_name: "lstm_param"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_1___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  }
-  bias_parameter_name: "lstm_bias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_1__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_1__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__last_seq_0__"
-    input_parameter_name: "softmax_param"
-  }
-  inputs {
-    input_layer_name: "__last_seq_1__"
-    input_parameter_name: "softmax_param"
-  }
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-parameters {
-  name: "mixed_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "lstm_param"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "lstm_bias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "softmax_param"
-  size: 1000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data_a"
-input_layer_names: "data_b"
-input_layer_names: "label"
-output_layer_names: "__cost_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-}
-sub_models {
-  name: "root"
-  layer_names: "data_a"
-  layer_names: "data_b"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__"
-  layer_names: "__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "label"
-  layer_names: "__cost_0__"
-  input_layer_names: "data_a"
-  input_layer_names: "data_b"
-  input_layer_names: "label"
-  output_layer_names: "__cost_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__lstm_group_0___recurrent_group"
-  layer_names: "__mixed_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_0__"
-    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__"
-  }
-}
-sub_models {
-  name: "__lstm_group_1___recurrent_group"
-  layer_names: "__mixed_1__@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-  layer_names: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_1__"
-    link_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
-    link_name: "__lstm_group_1__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
deleted file mode 100644
index 0d51f70ee..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ /dev/null
@@ -1,424 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__recurrent_layer_0__"
-  type: "recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___recurrent_layer_0__.w0"
-  }
-  bias_parameter_name: "___recurrent_layer_0__.wbias"
-  reversed: false
-}
-layers {
-  name: "__recurrent_layer_1__"
-  type: "recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___recurrent_layer_1__.w0"
-  }
-  bias_parameter_name: "___recurrent_layer_1__.wbias"
-  reversed: true
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 800
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-}
-layers {
-  name: "__lstmemory_0__"
-  type: "lstmemory"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-    input_parameter_name: "___lstmemory_0__.w0"
-  }
-  bias_parameter_name: "___lstmemory_0__.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__fc_layer_2__"
-  type: "fc"
-  size: 800
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_2__.w0"
-  }
-}
-layers {
-  name: "__lstmemory_1__"
-  type: "lstmemory"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_2__"
-    input_parameter_name: "___lstmemory_1__.w0"
-  }
-  bias_parameter_name: "___lstmemory_1__.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__fc_layer_3__"
-  type: "fc"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_3__.w0"
-  }
-}
-layers {
-  name: "__gru_0__"
-  type: "gated_recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_3__"
-    input_parameter_name: "___gru_0__.w0"
-  }
-  bias_parameter_name: "___gru_0__.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__fc_layer_4__"
-  type: "fc"
-  size: 600
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_4__.w0"
-  }
-}
-layers {
-  name: "__gru_1__"
-  type: "gated_recurrent"
-  size: 200
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_4__"
-    input_parameter_name: "___gru_1__.w0"
-  }
-  bias_parameter_name: "___gru_1__.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__recurrent_layer_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__recurrent_layer_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstmemory_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstmemory_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__first_seq_2__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_1__"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___recurrent_layer_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___recurrent_layer_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___recurrent_layer_1__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___recurrent_layer_1__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 800
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.wbias"
-  size: 1400
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1400
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_2__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 800
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_1__.w0"
-  size: 160000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_1__.wbias"
-  size: 1400
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1400
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_3__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.wbias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_4__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_1__.w0"
-  size: 120000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 600
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_1__.wbias"
-  size: 600
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 600
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__first_seq_1__"
-output_layer_names: "__last_seq_2__"
-output_layer_names: "__first_seq_2__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__recurrent_layer_0__"
-  layer_names: "__recurrent_layer_1__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__lstmemory_0__"
-  layer_names: "__fc_layer_2__"
-  layer_names: "__lstmemory_1__"
-  layer_names: "__fc_layer_3__"
-  layer_names: "__gru_0__"
-  layer_names: "__fc_layer_4__"
-  layer_names: "__gru_1__"
-  layer_names: "__last_seq_0__"
-  layer_names: "__first_seq_0__"
-  layer_names: "__last_seq_1__"
-  layer_names: "__first_seq_1__"
-  layer_names: "__last_seq_2__"
-  layer_names: "__first_seq_2__"
-  input_layer_names: "data"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__first_seq_1__"
-  output_layer_names: "__last_seq_2__"
-  output_layer_names: "__first_seq_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
deleted file mode 100644
index 9b69ae4a3..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
+++ /dev/null
@@ -1,93 +0,0 @@
-type: "nn"
-layers {
-  name: "data3D"
-  type: "data"
-  size: 360
-  active_type: ""
-  height: 6
-  width: 20
-  depth: 3
-}
-layers {
-  name: "__batch_norm_0__"
-  type: "batch_norm"
-  size: 360
-  active_type: "relu"
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w0"
-    image_conf {
-      channels: 1
-      img_size: 20
-      img_size_y: 6
-      img_size_z: 3
-    }
-  }
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w1"
-  }
-  inputs {
-    input_layer_name: "data3D"
-    input_parameter_name: "___batch_norm_0__.w2"
-  }
-  bias_parameter_name: "___batch_norm_0__.wbias"
-  moving_average_fraction: 0.9
-  height: 6
-  width: 20
-  depth: 3
-  epsilon: 1e-05
-}
-parameters {
-  name: "___batch_norm_0__.w0"
-  size: 1
-  initial_mean: 1.0
-  initial_std: 0.0
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___batch_norm_0__.w1"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.w2"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-  is_static: true
-  is_shared: true
-}
-parameters {
-  name: "___batch_norm_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data3D"
-output_layer_names: "__batch_norm_0__"
-sub_models {
-  name: "root"
-  layer_names: "data3D"
-  layer_names: "__batch_norm_0__"
-  input_layer_names: "data3D"
-  output_layer_names: "__batch_norm_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
deleted file mode 100644
index 8a1399efa..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
+++ /dev/null
@@ -1,155 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 120
-  active_type: ""
-}
-layers {
-  name: "__bidirectional_gru_0___fw_transform"
-  type: "mixed"
-  size: 120
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___bidirectional_gru_0___fw_transform.w0"
-    proj_conf {
-      type: "fc"
-      name: "___bidirectional_gru_0___fw_transform.w0"
-      input_size: 120
-      output_size: 120
-    }
-  }
-}
-layers {
-  name: "__bidirectional_gru_0___fw"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___fw_transform"
-    input_parameter_name: "___bidirectional_gru_0___fw.w0"
-  }
-  bias_parameter_name: "___bidirectional_gru_0___fw.wbias"
-  reversed: false
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__bidirectional_gru_0___bw_transform"
-  type: "mixed"
-  size: 120
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___bidirectional_gru_0___bw_transform.w0"
-    proj_conf {
-      type: "fc"
-      name: "___bidirectional_gru_0___bw_transform.w0"
-      input_size: 120
-      output_size: 120
-    }
-  }
-}
-layers {
-  name: "__bidirectional_gru_0___bw"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___bw_transform"
-    input_parameter_name: "___bidirectional_gru_0___bw.w0"
-  }
-  bias_parameter_name: "___bidirectional_gru_0___bw.wbias"
-  reversed: true
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__bidirectional_gru_0__"
-  type: "concat"
-  size: 80
-  active_type: ""
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___fw"
-  }
-  inputs {
-    input_layer_name: "__bidirectional_gru_0___bw"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-parameters {
-  name: "___bidirectional_gru_0___fw_transform.w0"
-  size: 14400
-  initial_mean: 0.0
-  initial_std: 0.0912870929175
-  dims: 120
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___fw.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___fw.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___bidirectional_gru_0___bw_transform.w0"
-  size: 14400
-  initial_mean: 0.0
-  initial_std: 0.0912870929175
-  dims: 120
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___bw.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___bidirectional_gru_0___bw.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__bidirectional_gru_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__bidirectional_gru_0___fw_transform"
-  layer_names: "__bidirectional_gru_0___fw"
-  layer_names: "__bidirectional_gru_0___bw_transform"
-  layer_names: "__bidirectional_gru_0___bw"
-  layer_names: "__bidirectional_gru_0__"
-  input_layer_names: "data"
-  output_layer_names: "__bidirectional_gru_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
deleted file mode 100644
index 25ec63237..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ /dev/null
@@ -1,137 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2304
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 36864
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 48
-      img_size: 48
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 48
-}
-layers {
-  name: "__bilinear_interp_layer_0__"
-  type: "bilinear_interp"
-  size: 65536
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    bilinear_interp_conf {
-      image_conf {
-        channels: 16
-        img_size: 48
-        img_size_y: 48
-      }
-      out_size_x: 64
-      out_size_y: 64
-    }
-  }
-  height: 64
-  width: 64
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 16384
-  active_type: ""
-  inputs {
-    input_layer_name: "__bilinear_interp_layer_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 16
-      size_x: 2
-      stride: 2
-      output_x: 32
-      img_size: 64
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 32
-      img_size_y: 64
-      padding_y: 0
-    }
-  }
-  height: 32
-  width: 32
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 384
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__pool_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 6291456
-  initial_mean: 0.0
-  initial_std: 0.0078125
-  dims: 16384
-  dims: 384
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__bilinear_interp_layer_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__fc_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
deleted file mode 100644
index 4b9578a0c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
+++ /dev/null
@@ -1,31 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__clip_0__"
-  type: "clip"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    clip_conf {
-      min: -10
-      max: 10
-    }
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__clip_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__clip_0__"
-  input_layer_names: "input"
-  output_layer_names: "__clip_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
deleted file mode 100644
index 9fe2bc29d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
+++ /dev/null
@@ -1,132 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 36288
-  active_type: ""
-  height: 48
-  width: 42
-  depth: 6
-}
-layers {
-  name: "conv3d_1"
-  type: "conv3d"
-  size: 24192
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_conv3d_1.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 21
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 3
-      img_size_z: 6
-    }
-  }
-  bias_parameter_name: "_conv3d_1.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 24
-  width: 21
-  depth: 3
-}
-layers {
-  name: "conv3d_2"
-  type: "conv3d"
-  size: 24192
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_conv3d_2.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 21
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 3
-      img_size_z: 6
-    }
-  }
-  bias_parameter_name: "_conv3d_2.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 24
-  width: 21
-  depth: 3
-}
-parameters {
-  name: "_conv3d_1.w0"
-  size: 1296
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_1.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_2.w0"
-  size: 1296
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_conv3d_2.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "conv3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "conv3d_1"
-  layer_names: "conv3d_2"
-  input_layer_names: "data"
-  output_layer_names: "conv3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
deleted file mode 100644
index 55ab464dd..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ /dev/null
@@ -1,375 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "labels"
-  type: "data"
-  size: 5000
-  active_type: ""
-}
-layers {
-  name: "probs"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "xe-label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 4
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__ctc_layer_0__"
-  type: "ctc"
-  size: 5001
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  norm_by_times: false
-}
-layers {
-  name: "__warp_ctc_layer_0__"
-  type: "warp_ctc"
-  size: 5001
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  norm_by_times: false
-  blank: 0
-}
-layers {
-  name: "crf_label"
-  type: "data"
-  size: 4
-  active_type: ""
-}
-layers {
-  name: "__crf_layer_0__"
-  type: "crf"
-  size: 4
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___crf_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "crf_label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "left"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "right"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__rank_cost_0__"
-  type: "rank-cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "left"
-  }
-  inputs {
-    input_layer_name: "right"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "list_feature"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "list_scores"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__lambda_cost_0__"
-  type: "lambda_cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "list_feature"
-  }
-  inputs {
-    input_layer_name: "list_scores"
-  }
-  NDCG_num: 5
-  max_sort_size: -1
-}
-layers {
-  name: "__cross_entropy_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__cross_entropy_with_selfnorm_0__"
-  type: "multi_class_cross_entropy_with_selfnorm"
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  softmax_selfnorm_alpha: 0.1
-  coeff: 1.0
-}
-layers {
-  name: "__huber_regression_cost_0__"
-  type: "huber_regression"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  coeff: 1.0
-  delta: 1.0
-}
-layers {
-  name: "huber_probs"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "huber_label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__huber_classification_cost_0__"
-  type: "huber_classification"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "huber_probs"
-  }
-  inputs {
-    input_layer_name: "huber_label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__multi_binary_label_cross_entropy_0__"
-  type: "multi_binary_label_cross_entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-  inputs {
-    input_layer_name: "xe-label"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__sum_cost_0__"
-  type: "sum_cost"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__nce_layer_0__"
-  type: "nce"
-  size: 1
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___nce_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "labels"
-  }
-  bias_parameter_name: "___nce_layer_0__.wbias"
-  num_classes: 5000
-  num_neg_samples: 10
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 800
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 4
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 4
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___crf_layer_0__.w0"
-  size: 24
-  initial_mean: 0.0
-  initial_std: 0.408248290464
-  dims: 6
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.0141421356237
-  dims: 5000
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.wbias"
-  size: 5000
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 5000
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-input_layer_names: "labels"
-input_layer_names: "crf_label"
-input_layer_names: "left"
-input_layer_names: "right"
-input_layer_names: "label"
-input_layer_names: "list_feature"
-input_layer_names: "list_scores"
-input_layer_names: "probs"
-input_layer_names: "xe-label"
-input_layer_names: "huber_probs"
-input_layer_names: "huber_label"
-output_layer_names: "__ctc_layer_0__"
-output_layer_names: "__warp_ctc_layer_0__"
-output_layer_names: "__crf_layer_0__"
-output_layer_names: "__rank_cost_0__"
-output_layer_names: "__lambda_cost_0__"
-output_layer_names: "__cross_entropy_0__"
-output_layer_names: "__cross_entropy_with_selfnorm_0__"
-output_layer_names: "__huber_regression_cost_0__"
-output_layer_names: "__huber_classification_cost_0__"
-output_layer_names: "__multi_binary_label_cross_entropy_0__"
-output_layer_names: "__sum_cost_0__"
-output_layer_names: "__nce_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "labels"
-  layer_names: "probs"
-  layer_names: "xe-label"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__ctc_layer_0__"
-  layer_names: "__warp_ctc_layer_0__"
-  layer_names: "crf_label"
-  layer_names: "__crf_layer_0__"
-  layer_names: "left"
-  layer_names: "right"
-  layer_names: "label"
-  layer_names: "__rank_cost_0__"
-  layer_names: "list_feature"
-  layer_names: "list_scores"
-  layer_names: "__lambda_cost_0__"
-  layer_names: "__cross_entropy_0__"
-  layer_names: "__cross_entropy_with_selfnorm_0__"
-  layer_names: "__huber_regression_cost_0__"
-  layer_names: "huber_probs"
-  layer_names: "huber_label"
-  layer_names: "__huber_classification_cost_0__"
-  layer_names: "__multi_binary_label_cross_entropy_0__"
-  layer_names: "__sum_cost_0__"
-  layer_names: "__nce_layer_0__"
-  input_layer_names: "input"
-  input_layer_names: "labels"
-  input_layer_names: "crf_label"
-  input_layer_names: "left"
-  input_layer_names: "right"
-  input_layer_names: "label"
-  input_layer_names: "list_feature"
-  input_layer_names: "list_scores"
-  input_layer_names: "probs"
-  input_layer_names: "xe-label"
-  input_layer_names: "huber_probs"
-  input_layer_names: "huber_label"
-  output_layer_names: "__ctc_layer_0__"
-  output_layer_names: "__warp_ctc_layer_0__"
-  output_layer_names: "__crf_layer_0__"
-  output_layer_names: "__rank_cost_0__"
-  output_layer_names: "__lambda_cost_0__"
-  output_layer_names: "__cross_entropy_0__"
-  output_layer_names: "__cross_entropy_with_selfnorm_0__"
-  output_layer_names: "__huber_regression_cost_0__"
-  output_layer_names: "__huber_classification_cost_0__"
-  output_layer_names: "__multi_binary_label_cross_entropy_0__"
-  output_layer_names: "__sum_cost_0__"
-  output_layer_names: "__nce_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
deleted file mode 100644
index cec8a73db..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ /dev/null
@@ -1,162 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "weight"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__cost_0__"
-  type: "multi-class-cross-entropy"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "__square_error_cost_0__"
-  type: "square_error"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  coeff: 1.0
-}
-layers {
-  name: "multi_class_label"
-  type: "data"
-  size: 500
-  active_type: ""
-}
-layers {
-  name: "__nce_layer_0__"
-  type: "nce"
-  size: 1
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___nce_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "multi_class_label"
-  }
-  inputs {
-    input_layer_name: "weight"
-  }
-  bias_parameter_name: "___nce_layer_0__.wbias"
-  num_classes: 500
-  num_neg_samples: 10
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 3000
-  initial_mean: 0.0
-  initial_std: 0.057735026919
-  dims: 300
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 10
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 10
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___nce_layer_0__.w0"
-  size: 5000
-  initial_mean: 0.0
-  initial_std: 0.04472135955
-  dims: 500
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___nce_layer_0__.wbias"
-  size: 500
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 500
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-input_layer_names: "label"
-input_layer_names: "weight"
-input_layer_names: "multi_class_label"
-output_layer_names: "__cost_0__"
-output_layer_names: "__square_error_cost_0__"
-output_layer_names: "__nce_layer_0__"
-evaluators {
-  name: "classification_error_evaluator"
-  type: "classification_error"
-  input_layers: "__fc_layer_0__"
-  input_layers: "label"
-  input_layers: "weight"
-}
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "label"
-  layer_names: "weight"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__cost_0__"
-  layer_names: "__square_error_cost_0__"
-  layer_names: "multi_class_label"
-  layer_names: "__nce_layer_0__"
-  input_layer_names: "input"
-  input_layer_names: "label"
-  input_layer_names: "weight"
-  input_layer_names: "multi_class_label"
-  output_layer_names: "__cost_0__"
-  output_layer_names: "__square_error_cost_0__"
-  output_layer_names: "__nce_layer_0__"
-  evaluator_names: "classification_error_evaluator"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
deleted file mode 100644
index a60256969..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
+++ /dev/null
@@ -1,207 +0,0 @@
-type: "nn"
-layers {
-  name: "sentence_states"
-  type: "data"
-  size: 32
-  active_type: ""
-}
-layers {
-  name: "sentence_scores"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__kmax_seq_score_layer_0__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  beam_size: 5
-}
-layers {
-  name: "__sub_nested_seq_layer_0__"
-  type: "sub_nested_seq"
-  size: 32
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_states"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_0__"
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__sub_nested_seq_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_1__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  beam_size: 5
-}
-layers {
-  name: "__seq_slice_layer_0__"
-  type: "seq_slice"
-  size: 32
-  active_type: ""
-  inputs {
-    input_layer_name: "__sub_nested_seq_layer_0__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_1__"
-  }
-  select_first: true
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "__seq_slice_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-  bias_parameter_name: "___fc_layer_1__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_2__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-  }
-  beam_size: 5
-}
-layers {
-  name: "sentences_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "start_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "end_ids"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__cross_entropy_over_beam_0__"
-  type: "cross_entropy_over_beam"
-  active_type: ""
-  inputs {
-    input_layer_name: "sentence_scores"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_0__"
-  }
-  inputs {
-    input_layer_name: "sentences_ids"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_1__"
-  }
-  inputs {
-    input_layer_name: "start_ids"
-  }
-  inputs {
-    input_layer_name: "__fc_layer_1__"
-  }
-  inputs {
-    input_layer_name: "__kmax_seq_score_layer_2__"
-  }
-  inputs {
-    input_layer_name: "end_ids"
-  }
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_1__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "sentence_scores"
-input_layer_names: "sentences_ids"
-input_layer_names: "sentence_states"
-input_layer_names: "start_ids"
-input_layer_names: "end_ids"
-output_layer_names: "__cross_entropy_over_beam_0__"
-sub_models {
-  name: "root"
-  layer_names: "sentence_states"
-  layer_names: "sentence_scores"
-  layer_names: "__kmax_seq_score_layer_0__"
-  layer_names: "__sub_nested_seq_layer_0__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__kmax_seq_score_layer_1__"
-  layer_names: "__seq_slice_layer_0__"
-  layer_names: "__fc_layer_1__"
-  layer_names: "__kmax_seq_score_layer_2__"
-  layer_names: "sentences_ids"
-  layer_names: "start_ids"
-  layer_names: "end_ids"
-  layer_names: "__cross_entropy_over_beam_0__"
-  input_layer_names: "sentence_scores"
-  input_layer_names: "sentences_ids"
-  input_layer_names: "sentence_states"
-  input_layer_names: "start_ids"
-  input_layer_names: "end_ids"
-  output_layer_names: "__cross_entropy_over_beam_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
deleted file mode 100644
index 7bf409731..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
+++ /dev/null
@@ -1,132 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 36288
-  active_type: ""
-  height: 48
-  width: 42
-  depth: 6
-}
-layers {
-  name: "deconv3d_1"
-  type: "deconv3d"
-  size: 1387760
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_deconv3d_1.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 16
-      output_x: 42
-      img_size: 83
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 48
-      img_size_y: 95
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 6
-      img_size_z: 11
-    }
-  }
-  bias_parameter_name: "_deconv3d_1.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 95
-  width: 83
-  depth: 11
-}
-layers {
-  name: "deconv3d_2"
-  type: "deconv3d"
-  size: 1387760
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "_deconv3d_2.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 2
-      padding: 1
-      groups: 1
-      filter_channels: 16
-      output_x: 42
-      img_size: 83
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 2
-      output_y: 48
-      img_size_y: 95
-      filter_size_z: 3
-      padding_z: 1
-      stride_z: 2
-      output_z: 6
-      img_size_z: 11
-    }
-  }
-  bias_parameter_name: "_deconv3d_2.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 95
-  width: 83
-  depth: 11
-}
-parameters {
-  name: "_deconv3d_1.w0"
-  size: 6912
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_1.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_2.w0"
-  size: 6912
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_deconv3d_2.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "deconv3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "deconv3d_1"
-  layer_names: "deconv3d_2"
-  input_layer_names: "data"
-  output_layer_names: "deconv3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
deleted file mode 100644
index 6690f9852..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
+++ /dev/null
@@ -1,66 +0,0 @@
-type: "nn"
-layers {
-  name: "input_loc"
-  type: "data"
-  size: 16
-  active_type: ""
-  height: 16
-  width: 1
-}
-layers {
-  name: "input_conf"
-  type: "data"
-  size: 8
-  active_type: ""
-  height: 1
-  width: 8
-}
-layers {
-  name: "priorbox"
-  type: "data"
-  size: 32
-  active_type: ""
-  height: 4
-  width: 8
-}
-layers {
-  name: "test_detection_output"
-  type: "detection_output"
-  size: 1400
-  active_type: ""
-  inputs {
-    input_layer_name: "priorbox"
-    detection_output_conf {
-      num_classes: 21
-      nms_threshold: 0.45
-      nms_top_k: 400
-      background_id: 0
-      input_num: 1
-      keep_top_k: 200
-      confidence_threshold: 0.01
-    }
-  }
-  inputs {
-    input_layer_name: "input_loc"
-  }
-  inputs {
-    input_layer_name: "input_conf"
-  }
-}
-input_layer_names: "priorbox"
-input_layer_names: "input_loc"
-input_layer_names: "input_conf"
-output_layer_names: "test_detection_output"
-sub_models {
-  name: "root"
-  layer_names: "input_loc"
-  layer_names: "input_conf"
-  layer_names: "priorbox"
-  layer_names: "test_detection_output"
-  input_layer_names: "priorbox"
-  input_layer_names: "input_loc"
-  input_layer_names: "input_conf"
-  output_layer_names: "test_detection_output"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
deleted file mode 100644
index f1530c382..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
+++ /dev/null
@@ -1,38 +0,0 @@
-type: "nn"
-layers {
-  name: "vector1"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "vector2"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__dot_prod_layer_0__"
-  type: "dot_prod"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "vector1"
-  }
-  inputs {
-    input_layer_name: "vector2"
-  }
-}
-input_layer_names: "vector1"
-input_layer_names: "vector2"
-output_layer_names: "__dot_prod_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "vector1"
-  layer_names: "vector2"
-  layer_names: "__dot_prod_layer_0__"
-  input_layer_names: "vector1"
-  input_layer_names: "vector2"
-  output_layer_names: "__dot_prod_layer_0__"
-  is_recurrent_layer_group: false
-}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
deleted file mode 100644
index f4b360522..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr
+++ /dev/null
@@ -1,56 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data_seq"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__expand_layer_0__"
-  type: "expand"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data_seq"
-  }
-  trans_type: "seq"
-}
-layers {
-  name: "__expand_layer_1__"
-  type: "expand"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data_seq"
-  }
-  trans_type: "non-seq"
-}
-input_layer_names: "data"
-input_layer_names: "data_seq"
-output_layer_names: "__expand_layer_0__"
-output_layer_names: "__expand_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "data_seq"
-  layer_names: "__expand_layer_0__"
-  layer_names: "__expand_layer_1__"
-  input_layer_names: "data"
-  input_layer_names: "data_seq"
-  output_layer_names: "__expand_layer_0__"
-  output_layer_names: "__expand_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
deleted file mode 100644
index 4f3002b19..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
+++ /dev/null
@@ -1,39 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 1024
-  active_type: ""
-}
-layers {
-  name: "__factorization_machine_0__"
-  type: "factorization_machine"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___factorization_machine_0__.w0"
-  }
-  factor_size: 10
-}
-parameters {
-  name: "___factorization_machine_0__.w0"
-  size: 10240
-  initial_mean: 0.0
-  initial_std: 0.03125
-  dims: 1024
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__factorization_machine_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__factorization_machine_0__"
-  input_layer_names: "data"
-  output_layer_names: "__factorization_machine_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
deleted file mode 100644
index 815189883..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr
+++ /dev/null
@@ -1,98 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__trans_layer_0__"
-  type: "trans"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__trans_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-layers {
-  name: "mask"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__selective_fc_layer_0__"
-  type: "selective_fc"
-  size: 100
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___selective_fc_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "mask"
-  }
-  bias_parameter_name: "___selective_fc_layer_0__.wbias"
-  selective_fc_pass_generation: false
-  has_selected_colums: true
-  selective_fc_full_mul_ratio: 0.02
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___selective_fc_layer_0__.w0"
-  size: 10000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-  is_sparse: false
-}
-parameters {
-  name: "___selective_fc_layer_0__.wbias"
-  size: 100
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 100
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "mask"
-output_layer_names: "__fc_layer_0__"
-output_layer_names: "__selective_fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__trans_layer_0__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "mask"
-  layer_names: "__selective_fc_layer_0__"
-  input_layer_names: "data"
-  input_layer_names: "mask"
-  output_layer_names: "__fc_layer_0__"
-  output_layer_names: "__selective_fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
deleted file mode 100644
index f1e4d894a..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+++ /dev/null
@@ -1,106 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 256
-  active_type: ""
-}
-layers {
-  name: "__gated_unit_layer_0___input_proj"
-  type: "fc"
-  size: 512
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___gated_unit_layer_0___input_proj.w0"
-  }
-  bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias"
-  error_clipping_threshold: 100.0
-}
-layers {
-  name: "__gated_unit_layer_0___gate"
-  type: "fc"
-  size: 512
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___gated_unit_layer_0___gate.w0"
-  }
-  bias_parameter_name: "___gated_unit_layer_0___gate.wbias"
-  error_clipping_threshold: 100.0
-}
-layers {
-  name: "__gated_unit_layer_0___gated_act"
-  type: "mixed"
-  size: 512
-  active_type: ""
-  inputs {
-    input_layer_name: "__gated_unit_layer_0___input_proj"
-  }
-  inputs {
-    input_layer_name: "__gated_unit_layer_0___gate"
-  }
-  error_clipping_threshold: 100.0
-  operator_confs {
-    type: "dot_mul"
-    input_indices: 0
-    input_indices: 1
-    input_sizes: 512
-    input_sizes: 512
-    output_size: 512
-    dotmul_scale: 1
-  }
-}
-parameters {
-  name: "___gated_unit_layer_0___input_proj.w0"
-  size: 131072
-  initial_mean: 0.0
-  initial_std: 0.0001
-  dims: 256
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___input_proj.wbias"
-  size: 512
-  initial_mean: 0.0
-  initial_std: 1
-  dims: 1
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___gate.w0"
-  size: 131072
-  initial_mean: 0.0
-  initial_std: 0.0001
-  dims: 256
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gated_unit_layer_0___gate.wbias"
-  size: 512
-  initial_mean: 0.0
-  initial_std: 1
-  dims: 1
-  dims: 512
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "__gated_unit_layer_0___gated_act"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__gated_unit_layer_0___input_proj"
-  layer_names: "__gated_unit_layer_0___gate"
-  layer_names: "__gated_unit_layer_0___gated_act"
-  input_layer_names: "input"
-  output_layer_names: "__gated_unit_layer_0___gated_act"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
deleted file mode 100644
index 2c19b2fd1..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 120
-  active_type: ""
-}
-layers {
-  name: "__gru_0__"
-  type: "gated_recurrent"
-  size: 40
-  active_type: "sigmoid"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___gru_0__.w0"
-  }
-  bias_parameter_name: "___gru_0__.wbias"
-  reversed: true
-  active_gate_type: "tanh"
-}
-parameters {
-  name: "___gru_0__.w0"
-  size: 4800
-  initial_mean: 0.0
-  initial_std: 0.158113883008
-  dims: 40
-  dims: 120
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_0__.wbias"
-  size: 120
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 120
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__gru_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__gru_0__"
-  input_layer_names: "data"
-  output_layer_names: "__gru_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
deleted file mode 100644
index e81fcb13c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr
+++ /dev/null
@@ -1,62 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__hsigmoid_0__"
-  type: "hsigmoid"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___hsigmoid_0__.w0"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  bias_parameter_name: "___hsigmoid_0__.wbias"
-  num_classes: 10
-}
-parameters {
-  name: "___hsigmoid_0__.w0"
-  size: 900
-  initial_mean: 0.0
-  initial_std: 0.333333333333
-  dims: 9
-  dims: 100
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___hsigmoid_0__.wbias"
-  size: 9
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 9
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "label"
-output_layer_names: "__hsigmoid_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "label"
-  layer_names: "__hsigmoid_0__"
-  input_layer_names: "data"
-  input_layer_names: "label"
-  output_layer_names: "__hsigmoid_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
deleted file mode 100644
index f93d368c8..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ /dev/null
@@ -1,59 +0,0 @@
-type: "nn"
-layers {
-  name: "input_seq"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 1
-  active_type: "exponential"
-  inputs {
-    input_layer_name: "input_seq"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__kmax_seq_score_layer_0__"
-  type: "kmax_seq_score"
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  beam_size: 5
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 128
-  initial_mean: 0.0
-  initial_std: 0.0883883476483
-  dims: 128
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input_seq"
-output_layer_names: "__kmax_seq_score_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input_seq"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__kmax_seq_score_layer_0__"
-  input_layer_names: "input_seq"
-  output_layer_names: "__kmax_seq_score_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
deleted file mode 100644
index 9ba33689e..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
+++ /dev/null
@@ -1,39 +0,0 @@
-type: "nn"
-layers {
-  name: "x"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "y"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__l2_distance_layer_0__"
-  type: "l2_distance"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "x"
-  }
-  inputs {
-    input_layer_name: "y"
-  }
-}
-input_layer_names: "x"
-input_layer_names: "y"
-output_layer_names: "__l2_distance_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "x"
-  layer_names: "y"
-  layer_names: "__l2_distance_layer_0__"
-  input_layer_names: "x"
-  input_layer_names: "y"
-  output_layer_names: "__l2_distance_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
deleted file mode 100644
index 76a4afab8..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr
+++ /dev/null
@@ -1,53 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "__lstmemory_0__"
-  type: "lstmemory"
-  size: 32
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___lstmemory_0__.w0"
-  }
-  bias_parameter_name: "___lstmemory_0__.wbias"
-  reversed: true
-  active_gate_type: "tanh"
-  active_state_type: "tanh"
-}
-parameters {
-  name: "___lstmemory_0__.w0"
-  size: 4096
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 32
-  dims: 4
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstmemory_0__.wbias"
-  size: 224
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 224
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__lstmemory_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__lstmemory_0__"
-  input_layer_names: "data"
-  output_layer_names: "__lstmemory_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
deleted file mode 100644
index 39dc48714..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ /dev/null
@@ -1,233 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2304
-  active_type: ""
-  height: 48
-  width: 48
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 36864
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 48
-      img_size: 48
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 48
-}
-layers {
-  name: "__maxout_layer_0__"
-  type: "maxout"
-  size: 18432
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    maxout_conf {
-      image_conf {
-        channels: 16
-        img_size: 48
-        img_size_y: 48
-      }
-      groups: 2
-    }
-  }
-  height: 48
-  width: 48
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 4608
-  active_type: ""
-  inputs {
-    input_layer_name: "__maxout_layer_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 8
-      size_x: 2
-      stride: 2
-      output_x: 24
-      img_size: 48
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      padding_y: 0
-    }
-  }
-  height: 24
-  width: 24
-}
-layers {
-  name: "__conv_1__"
-  type: "exconv"
-  size: 73728
-  active_type: ""
-  inputs {
-    input_layer_name: "__pool_0__"
-    input_parameter_name: "___conv_1__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 8
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 8
-      output_x: 24
-      img_size: 24
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 24
-      img_size_y: 24
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_1__.wbias"
-  num_filters: 128
-  shared_biases: true
-  height: 24
-  width: 24
-}
-layers {
-  name: "__maxout_layer_1__"
-  type: "maxout"
-  size: 18432
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_1__"
-    maxout_conf {
-      image_conf {
-        channels: 128
-        img_size: 24
-        img_size_y: 24
-      }
-      groups: 4
-    }
-  }
-  height: 24
-  width: 24
-}
-layers {
-  name: "__block_expand_layer_0__"
-  type: "blockexpand"
-  size: 192
-  active_type: ""
-  inputs {
-    input_layer_name: "__maxout_layer_1__"
-    block_expand_conf {
-      channels: 32
-      stride_x: 1
-      stride_y: 1
-      padding_x: 0
-      padding_y: 0
-      block_x: 1
-      block_y: 6
-      output_x: 0
-      output_y: 0
-      img_size_x: 0
-      img_size_y: 0
-    }
-  }
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 384
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__block_expand_layer_0__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_1__.w0"
-  size: 9216
-  initial_mean: 0.0
-  initial_std: 0.166666666667
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_1__.wbias"
-  size: 128
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 128
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 73728
-  initial_mean: 0.0
-  initial_std: 0.0721687836487
-  dims: 192
-  dims: 384
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__maxout_layer_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__conv_1__"
-  layer_names: "__maxout_layer_1__"
-  layer_names: "__block_expand_layer_0__"
-  layer_names: "__fc_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
deleted file mode 100644
index 0ba84dcc6..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
+++ /dev/null
@@ -1,79 +0,0 @@
-type: "nn"
-layers {
-  name: "input_loc"
-  type: "data"
-  size: 16
-  active_type: ""
-  height: 16
-  width: 1
-}
-layers {
-  name: "input_conf"
-  type: "data"
-  size: 8
-  active_type: ""
-  height: 1
-  width: 8
-}
-layers {
-  name: "priorbox"
-  type: "data"
-  size: 32
-  active_type: ""
-  height: 4
-  width: 8
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 24
-  active_type: ""
-  height: 4
-  width: 6
-}
-layers {
-  name: "test_multibox_loss"
-  type: "multibox_loss"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "priorbox"
-    multibox_loss_conf {
-      num_classes: 21
-      overlap_threshold: 0.5
-      neg_pos_ratio: 3.0
-      neg_overlap: 0.5
-      background_id: 0
-      input_num: 1
-    }
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  inputs {
-    input_layer_name: "input_loc"
-  }
-  inputs {
-    input_layer_name: "input_conf"
-  }
-}
-input_layer_names: "priorbox"
-input_layer_names: "label"
-input_layer_names: "input_loc"
-input_layer_names: "input_conf"
-output_layer_names: "test_multibox_loss"
-sub_models {
-  name: "root"
-  layer_names: "input_loc"
-  layer_names: "input_conf"
-  layer_names: "priorbox"
-  layer_names: "label"
-  layer_names: "test_multibox_loss"
-  input_layer_names: "priorbox"
-  input_layer_names: "label"
-  input_layer_names: "input_loc"
-  input_layer_names: "input_conf"
-  output_layer_names: "test_multibox_loss"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
deleted file mode 100644
index 379842ba8..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multiplex_layer.protostr
+++ /dev/null
@@ -1,63 +0,0 @@
-type: "nn"
-layers {
-  name: "index"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "data1"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data2"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data3"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__multiplex_layer_0__"
-  type: "multiplex"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "index"
-  }
-  inputs {
-    input_layer_name: "data1"
-  }
-  inputs {
-    input_layer_name: "data2"
-  }
-  inputs {
-    input_layer_name: "data3"
-  }
-}
-input_layer_names: "index"
-input_layer_names: "data1"
-input_layer_names: "data2"
-input_layer_names: "data3"
-output_layer_names: "__multiplex_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "index"
-  layer_names: "data1"
-  layer_names: "data2"
-  layer_names: "data3"
-  layer_names: "__multiplex_layer_0__"
-  input_layer_names: "index"
-  input_layer_names: "data1"
-  input_layer_names: "data2"
-  input_layer_names: "data3"
-  output_layer_names: "__multiplex_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
deleted file mode 100644
index c1bfdf1b1..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ /dev/null
@@ -1,225 +0,0 @@
-type: "nn"
-layers {
-  name: "w"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "a"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "b"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "c"
-  type: "data"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "d"
-  type: "data"
-  size: 31
-  active_type: ""
-}
-layers {
-  name: "__interpolation_layer_0__"
-  type: "interpolation"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-}
-layers {
-  name: "__power_layer_0__"
-  type: "power"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__scaling_layer_0__"
-  type: "scaling"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "w"
-  }
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__cos_sim_0__"
-  type: "cos"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  cos_scale: 1
-}
-layers {
-  name: "__cos_sim_1__"
-  type: "cos_vm"
-  size: 2
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "c"
-  }
-  cos_scale: 1
-}
-layers {
-  name: "__sum_to_one_norm_layer_0__"
-  type: "sum_to_one_norm"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-}
-layers {
-  name: "__conv_shift_layer_0__"
-  type: "conv_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "d"
-  }
-}
-layers {
-  name: "__tensor_layer_0__"
-  type: "tensor"
-  size: 1000
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-    input_parameter_name: "___tensor_layer_0__.w0"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  bias_parameter_name: "___tensor_layer_0__.wbias"
-}
-layers {
-  name: "__slope_intercept_layer_0__"
-  type: "slope_intercept"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  slope: 0.7
-  intercept: 0.9
-}
-layers {
-  name: "__linear_comb_layer_0__"
-  type: "convex_comb"
-  size: 2
-  active_type: ""
-  inputs {
-    input_layer_name: "b"
-  }
-  inputs {
-    input_layer_name: "c"
-  }
-}
-parameters {
-  name: "___tensor_layer_0__.w0"
-  size: 10000000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 100
-  dims: 1000
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___tensor_layer_0__.wbias"
-  size: 1000
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1000
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "w"
-input_layer_names: "a"
-input_layer_names: "b"
-input_layer_names: "c"
-input_layer_names: "d"
-output_layer_names: "__interpolation_layer_0__"
-output_layer_names: "__power_layer_0__"
-output_layer_names: "__scaling_layer_0__"
-output_layer_names: "__cos_sim_0__"
-output_layer_names: "__cos_sim_1__"
-output_layer_names: "__sum_to_one_norm_layer_0__"
-output_layer_names: "__conv_shift_layer_0__"
-output_layer_names: "__tensor_layer_0__"
-output_layer_names: "__slope_intercept_layer_0__"
-output_layer_names: "__linear_comb_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "w"
-  layer_names: "a"
-  layer_names: "b"
-  layer_names: "c"
-  layer_names: "d"
-  layer_names: "__interpolation_layer_0__"
-  layer_names: "__power_layer_0__"
-  layer_names: "__scaling_layer_0__"
-  layer_names: "__cos_sim_0__"
-  layer_names: "__cos_sim_1__"
-  layer_names: "__sum_to_one_norm_layer_0__"
-  layer_names: "__conv_shift_layer_0__"
-  layer_names: "__tensor_layer_0__"
-  layer_names: "__slope_intercept_layer_0__"
-  layer_names: "__linear_comb_layer_0__"
-  input_layer_names: "w"
-  input_layer_names: "a"
-  input_layer_names: "b"
-  input_layer_names: "c"
-  input_layer_names: "d"
-  output_layer_names: "__interpolation_layer_0__"
-  output_layer_names: "__power_layer_0__"
-  output_layer_names: "__scaling_layer_0__"
-  output_layer_names: "__cos_sim_0__"
-  output_layer_names: "__cos_sim_1__"
-  output_layer_names: "__sum_to_one_norm_layer_0__"
-  output_layer_names: "__conv_shift_layer_0__"
-  output_layer_names: "__tensor_layer_0__"
-  output_layer_names: "__slope_intercept_layer_0__"
-  output_layer_names: "__linear_comb_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
deleted file mode 100644
index d5d6d31a1..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ /dev/null
@@ -1,122 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2016
-  active_type: ""
-  height: 48
-  width: 42
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 32256
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 1
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 1
-      output_x: 42
-      img_size: 42
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 48
-      img_size_y: 48
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 48
-  width: 42
-}
-layers {
-  name: "__pool_0__"
-  type: "pool"
-  size: 8064
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 16
-      size_x: 2
-      stride: 2
-      output_x: 21
-      img_size: 42
-      padding: 0
-      size_y: 2
-      stride_y: 2
-      output_y: 24
-      img_size_y: 48
-      padding_y: 0
-    }
-  }
-  height: 24
-  width: 21
-}
-layers {
-  name: "__pad_0__"
-  type: "pad"
-  size: 14175
-  active_type: ""
-  inputs {
-    input_layer_name: "__pool_0__"
-    pad_conf {
-      image_conf {
-        channels: 16
-        img_size: 21
-        img_size_y: 24
-      }
-      pad_c: 2
-      pad_c: 3
-      pad_h: 1
-      pad_h: 2
-      pad_w: 3
-      pad_w: 1
-    }
-  }
-  height: 27
-  width: 25
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 144
-  initial_mean: 0.0
-  initial_std: 0.471404520791
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__pad_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__conv_0__"
-  layer_names: "__pool_0__"
-  layer_names: "__pad_0__"
-  input_layer_names: "data"
-  output_layer_names: "__pad_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
deleted file mode 100644
index 8eb98593f..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
+++ /dev/null
@@ -1,123 +0,0 @@
-type: "nn"
-layers {
-  name: "data_2d"
-  type: "data"
-  size: 6000
-  active_type: ""
-  height: 20
-  width: 10
-}
-layers {
-  name: "pool___2d"
-  type: "pool"
-  size: 840
-  active_type: ""
-  inputs {
-    input_layer_name: "data_2d"
-    pool_conf {
-      pool_type: "avg-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-    }
-  }
-  height: 7
-  width: 4
-}
-layers {
-  name: "data_3d_1"
-  type: "data"
-  size: 60000
-  active_type: ""
-  height: 20
-  width: 10
-  depth: 10
-}
-layers {
-  name: "pool_3d_1"
-  type: "pool3d"
-  size: 3360
-  active_type: ""
-  inputs {
-    input_layer_name: "data_3d_1"
-    pool_conf {
-      pool_type: "avg-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-      size_z: 5
-      stride_z: 3
-      output_z: 4
-      img_size_z: 10
-      padding_z: 1
-    }
-  }
-  height: 7
-  width: 4
-  depth: 4
-}
-layers {
-  name: "pool_3d_2"
-  type: "pool3d"
-  size: 3360
-  active_type: ""
-  inputs {
-    input_layer_name: "data_3d_1"
-    pool_conf {
-      pool_type: "max-projection"
-      channels: 30
-      size_x: 5
-      stride: 3
-      output_x: 4
-      img_size: 10
-      padding: 1
-      size_y: 5
-      stride_y: 3
-      output_y: 7
-      img_size_y: 20
-      padding_y: 1
-      size_z: 5
-      stride_z: 3
-      output_z: 4
-      img_size_z: 10
-      padding_z: 1
-    }
-  }
-  height: 7
-  width: 4
-  depth: 4
-}
-input_layer_names: "data_2d"
-output_layer_names: "pool___2d"
-output_layer_names: "pool_3d_1"
-output_layer_names: "pool_3d_2"
-sub_models {
-  name: "root"
-  layer_names: "data_2d"
-  layer_names: "pool___2d"
-  layer_names: "data_3d_1"
-  layer_names: "pool_3d_1"
-  layer_names: "pool_3d_2"
-  input_layer_names: "data_2d"
-  output_layer_names: "pool___2d"
-  output_layer_names: "pool_3d_1"
-  output_layer_names: "pool_3d_2"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
deleted file mode 100644
index 63fb38c65..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
+++ /dev/null
@@ -1,144 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-  height: 10
-  width: 10
-}
-layers {
-  name: "__prelu_layer_0__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_0__.w0"
-  }
-  partial_sum: 1
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_1__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_1__.w0"
-  }
-  partial_sum: 1
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_2__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_2__.w0"
-  }
-  partial_sum: 5
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_3__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_3__.w0"
-  }
-  partial_sum: 300
-  height: 10
-  width: 10
-  depth: 1
-}
-layers {
-  name: "__prelu_layer_4__"
-  type: "prelu"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-    input_parameter_name: "___prelu_layer_4__.w0"
-  }
-  partial_sum: 100
-  height: 10
-  width: 10
-  depth: 1
-}
-parameters {
-  name: "___prelu_layer_0__.w0"
-  size: 300
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_1__.w0"
-  size: 300
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_2__.w0"
-  size: 60
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 60
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_3__.w0"
-  size: 1
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___prelu_layer_4__.w0"
-  size: 3
-  initial_mean: 0.25
-  initial_std: 0.0
-  dims: 1
-  dims: 3
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "input"
-output_layer_names: "__prelu_layer_4__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__prelu_layer_0__"
-  layer_names: "__prelu_layer_1__"
-  layer_names: "__prelu_layer_2__"
-  layer_names: "__prelu_layer_3__"
-  layer_names: "__prelu_layer_4__"
-  input_layer_names: "input"
-  output_layer_names: "__prelu_layer_4__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
deleted file mode 100644
index f4cc492df..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__print_0__"
-  type: "print"
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  user_arg: "layer=input %s"
-}
-input_layer_names: "input"
-output_layer_names: "input"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__print_0__"
-  input_layer_names: "input"
-  output_layer_names: "input"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
deleted file mode 100644
index 046037936..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+++ /dev/null
@@ -1,593 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__addto_0__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  inputs {
-    input_layer_name: "data"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_1__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_0__"
-  }
-  inputs {
-    input_layer_name: "__addto_0__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_2__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_1__"
-  }
-  inputs {
-    input_layer_name: "__addto_1__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_3__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_2__"
-  }
-  inputs {
-    input_layer_name: "__addto_2__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_4__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_3__"
-  }
-  inputs {
-    input_layer_name: "__addto_3__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_5__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_4__"
-  }
-  inputs {
-    input_layer_name: "__addto_4__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_6__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_5__"
-  }
-  inputs {
-    input_layer_name: "__addto_5__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_7__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_6__"
-  }
-  inputs {
-    input_layer_name: "__addto_6__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_8__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_7__"
-  }
-  inputs {
-    input_layer_name: "__addto_7__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_9__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_8__"
-  }
-  inputs {
-    input_layer_name: "__addto_8__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_10__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_9__"
-  }
-  inputs {
-    input_layer_name: "__addto_9__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_11__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_10__"
-  }
-  inputs {
-    input_layer_name: "__addto_10__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_12__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_11__"
-  }
-  inputs {
-    input_layer_name: "__addto_11__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_13__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_12__"
-  }
-  inputs {
-    input_layer_name: "__addto_12__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_14__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_13__"
-  }
-  inputs {
-    input_layer_name: "__addto_13__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_15__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_14__"
-  }
-  inputs {
-    input_layer_name: "__addto_14__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_16__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_15__"
-  }
-  inputs {
-    input_layer_name: "__addto_15__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_17__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_16__"
-  }
-  inputs {
-    input_layer_name: "__addto_16__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_18__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_17__"
-  }
-  inputs {
-    input_layer_name: "__addto_17__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_19__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_18__"
-  }
-  inputs {
-    input_layer_name: "__addto_18__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_20__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_19__"
-  }
-  inputs {
-    input_layer_name: "__addto_19__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_21__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_20__"
-  }
-  inputs {
-    input_layer_name: "__addto_20__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_22__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_21__"
-  }
-  inputs {
-    input_layer_name: "__addto_21__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_23__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_22__"
-  }
-  inputs {
-    input_layer_name: "__addto_22__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_24__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_23__"
-  }
-  inputs {
-    input_layer_name: "__addto_23__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_25__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_24__"
-  }
-  inputs {
-    input_layer_name: "__addto_24__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_26__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_25__"
-  }
-  inputs {
-    input_layer_name: "__addto_25__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_27__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_26__"
-  }
-  inputs {
-    input_layer_name: "__addto_26__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_28__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_27__"
-  }
-  inputs {
-    input_layer_name: "__addto_27__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_29__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_28__"
-  }
-  inputs {
-    input_layer_name: "__addto_28__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_30__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_29__"
-  }
-  inputs {
-    input_layer_name: "__addto_29__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__addto_31__"
-  type: "addto"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__addto_30__"
-  }
-  inputs {
-    input_layer_name: "__addto_30__"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "fc"
-  size: 32
-  active_type: "relu"
-  inputs {
-    input_layer_name: "__addto_31__"
-    input_parameter_name: "___fc_layer_0__.w0"
-  }
-  bias_parameter_name: "___fc_layer_0__.wbias"
-}
-layers {
-  name: "__fc_layer_1__"
-  type: "fc"
-  size: 10
-  active_type: "softmax"
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-    input_parameter_name: "___fc_layer_1__.w0"
-  }
-  bias_parameter_name: "___fc_layer_1__.wbias"
-}
-parameters {
-  name: "___fc_layer_0__.w0"
-  size: 3200
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 32
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__.wbias"
-  size: 32
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 32
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_1__.w0"
-  size: 320
-  initial_mean: 0.0
-  initial_std: 0.176776695297
-  dims: 32
-  dims: 10
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_1__.wbias"
-  size: 10
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 10
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__fc_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__addto_0__"
-  layer_names: "__addto_1__"
-  layer_names: "__addto_2__"
-  layer_names: "__addto_3__"
-  layer_names: "__addto_4__"
-  layer_names: "__addto_5__"
-  layer_names: "__addto_6__"
-  layer_names: "__addto_7__"
-  layer_names: "__addto_8__"
-  layer_names: "__addto_9__"
-  layer_names: "__addto_10__"
-  layer_names: "__addto_11__"
-  layer_names: "__addto_12__"
-  layer_names: "__addto_13__"
-  layer_names: "__addto_14__"
-  layer_names: "__addto_15__"
-  layer_names: "__addto_16__"
-  layer_names: "__addto_17__"
-  layer_names: "__addto_18__"
-  layer_names: "__addto_19__"
-  layer_names: "__addto_20__"
-  layer_names: "__addto_21__"
-  layer_names: "__addto_22__"
-  layer_names: "__addto_23__"
-  layer_names: "__addto_24__"
-  layer_names: "__addto_25__"
-  layer_names: "__addto_26__"
-  layer_names: "__addto_27__"
-  layer_names: "__addto_28__"
-  layer_names: "__addto_29__"
-  layer_names: "__addto_30__"
-  layer_names: "__addto_31__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__fc_layer_1__"
-  input_layer_names: "data"
-  output_layer_names: "__fc_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
deleted file mode 100644
index e012386ff..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
+++ /dev/null
@@ -1,42 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__repeat_layer_0__"
-  type: "featmap_expand"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-  }
-  num_filters: 10
-}
-layers {
-  name: "__repeat_layer_1__"
-  type: "featmap_expand"
-  size: 300
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "data"
-  }
-  num_filters: 10
-  user_arg: "as_col_vec"
-}
-input_layer_names: "data"
-output_layer_names: "__repeat_layer_0__"
-output_layer_names: "__repeat_layer_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__repeat_layer_0__"
-  layer_names: "__repeat_layer_1__"
-  input_layer_names: "data"
-  output_layer_names: "__repeat_layer_0__"
-  output_layer_names: "__repeat_layer_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
deleted file mode 100644
index 9399252b2..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__resize_0__"
-  type: "resize"
-  size: 150
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__resize_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__resize_0__"
-  input_layer_names: "input"
-  output_layer_names: "__resize_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
deleted file mode 100644
index 711785be3..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ /dev/null
@@ -1,738 +0,0 @@
-type: "recurrent_nn"
-layers {
-  name: "seq_input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "sub_seq_input"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 1
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "seq_input"
-    input_parameter_name: "___mixed_0__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_0__.w0"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__mixed_1__"
-  type: "mixed"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "seq_input"
-    input_parameter_name: "___mixed_1__.w0"
-    proj_conf {
-      type: "fc"
-      name: "___mixed_1__.w0"
-      input_size: 100
-      output_size: 300
-    }
-  }
-}
-layers {
-  name: "__recurrent_group_0__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_0__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_forward+delay1@__recurrent_group_0__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_forward@__recurrent_group_0__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_0__"
-    input_parameter_name: "_rnn_forward@__recurrent_group_0__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_forward+delay1@__recurrent_group_0__"
-    input_parameter_name: "_rnn_forward@__recurrent_group_0__.w1"
-  }
-  bias_parameter_name: "_rnn_forward@__recurrent_group_0__.wbias"
-}
-layers {
-  name: "rnn_forward"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_forward"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_1__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_1__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_back+delay1@__recurrent_group_1__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_back@__recurrent_group_1__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_1__"
-    input_parameter_name: "_rnn_back@__recurrent_group_1__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_back+delay1@__recurrent_group_1__"
-    input_parameter_name: "_rnn_back@__recurrent_group_1__.w1"
-  }
-  bias_parameter_name: "_rnn_back@__recurrent_group_1__.wbias"
-}
-layers {
-  name: "rnn_back"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__first_seq_0__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_back"
-  }
-  select_first: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_2__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "sub_seq_input@__recurrent_group_2__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "rnn_subseq_forward@__recurrent_group_2__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "sub_seq_input@__recurrent_group_2__"
-    input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w0"
-  }
-  inputs {
-    input_layer_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-    input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w1"
-  }
-  bias_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.wbias"
-}
-layers {
-  name: "rnn_subseq_forward"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_1__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "rnn_subseq_forward"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__lstm_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 400
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  type: "mixed"
-  size: 400
-  active_type: ""
-  inputs {
-    input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    proj_conf {
-      type: "identity"
-      name: "___lstm_group_0___input_recurrent.w0"
-      input_size: 400
-      output_size: 400
-    }
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    input_parameter_name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1"
-    proj_conf {
-      type: "fc"
-      name: "___lstm_group_0___input_recurrent.w1"
-      input_size: 100
-      output_size: 400
-    }
-  }
-}
-layers {
-  name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  type: "lstm_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  }
-  inputs {
-    input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
-  active_gate_type: "sigmoid"
-  active_state_type: "tanh"
-}
-layers {
-  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  type: "get_output"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    input_layer_argument: "state"
-  }
-}
-layers {
-  name: "__lstm_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_2__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__lstm_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__gru_group_0___recurrent_group"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "__mixed_1__@__gru_group_0___recurrent_group"
-  type: "scatter_agent"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  type: "agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__gru_group_0__@__gru_group_0___recurrent_group"
-  type: "gru_step"
-  size: 100
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
-  }
-  inputs {
-    input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  }
-  bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
-  active_gate_type: "sigmoid"
-}
-layers {
-  name: "__gru_group_0__"
-  type: "gather_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__last_seq_3__"
-  type: "seqlastins"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "__gru_group_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__recurrent_group_3__"
-  type: "recurrent_layer_group"
-  active_type: ""
-}
-layers {
-  name: "seq_input@__recurrent_group_3__"
-  type: "scatter_agent"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__memory_6__@__recurrent_group_3__"
-  type: "agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__fc_layer_0__@__recurrent_group_3__"
-  type: "fc"
-  size: 200
-  active_type: "tanh"
-  inputs {
-    input_layer_name: "seq_input@__recurrent_group_3__"
-    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0"
-  }
-  inputs {
-    input_layer_name: "__memory_6__@__recurrent_group_3__"
-    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1"
-  }
-  bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias"
-}
-layers {
-  name: "__fc_layer_0__"
-  type: "gather_agent"
-  size: 200
-  active_type: ""
-}
-layers {
-  name: "__last_seq_4__"
-  type: "seqlastins"
-  size: 200
-  active_type: ""
-  inputs {
-    input_layer_name: "__fc_layer_0__"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-parameters {
-  name: "___mixed_0__.w0"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___mixed_1__.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 300
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_forward@__recurrent_group_0__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_back@__recurrent_group_1__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "_rnn_subseq_forward@__recurrent_group_2__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 400
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
-  size: 30000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 300
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
-  size: 300
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 300
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.w0"
-  size: 20000
-  initial_mean: 0.0
-  initial_std: 0.1
-  dims: 100
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.w1"
-  size: 40000
-  initial_mean: 0.0
-  initial_std: 0.0707106781187
-  dims: 200
-  dims: 200
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___fc_layer_0__@__recurrent_group_3__.wbias"
-  size: 200
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 200
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "seq_input"
-input_layer_names: "sub_seq_input"
-output_layer_names: "__last_seq_0__"
-output_layer_names: "__first_seq_0__"
-output_layer_names: "__last_seq_1__"
-output_layer_names: "__last_seq_2__"
-output_layer_names: "__last_seq_3__"
-output_layer_names: "__last_seq_4__"
-sub_models {
-  name: "root"
-  layer_names: "seq_input"
-  layer_names: "sub_seq_input"
-  layer_names: "label"
-  layer_names: "__mixed_0__"
-  layer_names: "__mixed_1__"
-  layer_names: "__recurrent_group_0__"
-  layer_names: "rnn_forward"
-  layer_names: "__last_seq_0__"
-  layer_names: "__recurrent_group_1__"
-  layer_names: "rnn_back"
-  layer_names: "__first_seq_0__"
-  layer_names: "__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward"
-  layer_names: "__last_seq_1__"
-  layer_names: "__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__"
-  layer_names: "__last_seq_2__"
-  layer_names: "__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__"
-  layer_names: "__last_seq_3__"
-  layer_names: "__recurrent_group_3__"
-  layer_names: "__fc_layer_0__"
-  layer_names: "__last_seq_4__"
-  input_layer_names: "seq_input"
-  input_layer_names: "sub_seq_input"
-  output_layer_names: "__last_seq_0__"
-  output_layer_names: "__first_seq_0__"
-  output_layer_names: "__last_seq_1__"
-  output_layer_names: "__last_seq_2__"
-  output_layer_names: "__last_seq_3__"
-  output_layer_names: "__last_seq_4__"
-  is_recurrent_layer_group: false
-}
-sub_models {
-  name: "__recurrent_group_0__"
-  layer_names: "seq_input@__recurrent_group_0__"
-  layer_names: "rnn_forward+delay1@__recurrent_group_0__"
-  layer_names: "rnn_forward@__recurrent_group_0__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "rnn_forward@__recurrent_group_0__"
-    link_name: "rnn_forward+delay1@__recurrent_group_0__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_0__"
-  }
-  out_links {
-    layer_name: "rnn_forward@__recurrent_group_0__"
-    link_name: "rnn_forward"
-  }
-}
-sub_models {
-  name: "__recurrent_group_1__"
-  layer_names: "seq_input@__recurrent_group_1__"
-  layer_names: "rnn_back+delay1@__recurrent_group_1__"
-  layer_names: "rnn_back@__recurrent_group_1__"
-  is_recurrent_layer_group: true
-  reversed: true
-  memories {
-    layer_name: "rnn_back@__recurrent_group_1__"
-    link_name: "rnn_back+delay1@__recurrent_group_1__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_1__"
-  }
-  out_links {
-    layer_name: "rnn_back@__recurrent_group_1__"
-    link_name: "rnn_back"
-  }
-}
-sub_models {
-  name: "__recurrent_group_2__"
-  layer_names: "sub_seq_input@__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  layer_names: "rnn_subseq_forward@__recurrent_group_2__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
-    link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-  }
-  in_links {
-    layer_name: "sub_seq_input"
-    link_name: "sub_seq_input@__recurrent_group_2__"
-  }
-  out_links {
-    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
-    link_name: "rnn_subseq_forward"
-  }
-}
-sub_models {
-  name: "__lstm_group_0___recurrent_group"
-  layer_names: "__mixed_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-  layer_names: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-  }
-  memories {
-    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_0__"
-    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
-    link_name: "__lstm_group_0__"
-  }
-}
-sub_models {
-  name: "__gru_group_0___recurrent_group"
-  layer_names: "__mixed_1__@__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  layer_names: "__gru_group_0__@__gru_group_0___recurrent_group"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
-    link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-  }
-  in_links {
-    layer_name: "__mixed_1__"
-    link_name: "__mixed_1__@__gru_group_0___recurrent_group"
-  }
-  out_links {
-    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
-    link_name: "__gru_group_0__"
-  }
-}
-sub_models {
-  name: "__recurrent_group_3__"
-  layer_names: "seq_input@__recurrent_group_3__"
-  layer_names: "__memory_6__@__recurrent_group_3__"
-  layer_names: "__fc_layer_0__@__recurrent_group_3__"
-  is_recurrent_layer_group: true
-  reversed: false
-  memories {
-    layer_name: "__fc_layer_0__@__recurrent_group_3__"
-    link_name: "__memory_6__@__recurrent_group_3__"
-  }
-  in_links {
-    layer_name: "seq_input"
-    link_name: "seq_input@__recurrent_group_3__"
-  }
-  out_links {
-    layer_name: "__fc_layer_0__@__recurrent_group_3__"
-    link_name: "__fc_layer_0__"
-  }
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
deleted file mode 100644
index 0ec88aa99..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
+++ /dev/null
@@ -1,100 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 588
-  active_type: ""
-  height: 14
-  width: 14
-}
-layers {
-  name: "rois"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__conv_0__"
-  type: "exconv"
-  size: 3136
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___conv_0__.w0"
-    conv_conf {
-      filter_size: 3
-      channels: 3
-      stride: 1
-      padding: 1
-      groups: 1
-      filter_channels: 3
-      output_x: 14
-      img_size: 14
-      caffe_mode: true
-      filter_size_y: 3
-      padding_y: 1
-      stride_y: 1
-      output_y: 14
-      img_size_y: 14
-      dilation: 1
-      dilation_y: 1
-    }
-  }
-  bias_parameter_name: "___conv_0__.wbias"
-  num_filters: 16
-  shared_biases: true
-  height: 14
-  width: 14
-}
-layers {
-  name: "__roi_pool_0__"
-  type: "roi_pool"
-  size: 784
-  active_type: ""
-  inputs {
-    input_layer_name: "__conv_0__"
-    roi_pool_conf {
-      pooled_width: 7
-      pooled_height: 7
-      spatial_scale: 0.0625
-    }
-  }
-  inputs {
-    input_layer_name: "rois"
-  }
-  height: 7
-  width: 7
-}
-parameters {
-  name: "___conv_0__.w0"
-  size: 432
-  initial_mean: 0.0
-  initial_std: 0.272165526976
-  initial_strategy: 0
-  initial_smart: false
-}
-parameters {
-  name: "___conv_0__.wbias"
-  size: 16
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 16
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-input_layer_names: "rois"
-output_layer_names: "__roi_pool_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "rois"
-  layer_names: "__conv_0__"
-  layer_names: "__roi_pool_0__"
-  input_layer_names: "data"
-  input_layer_names: "rois"
-  output_layer_names: "__roi_pool_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
deleted file mode 100644
index 19c9f1657..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+++ /dev/null
@@ -1,41 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2560
-  active_type: ""
-}
-layers {
-  name: "__row_conv_layer_0__"
-  type: "row_conv"
-  size: 2560
-  active_type: "relu"
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___row_conv_layer_0__.w0"
-    row_conv_conf {
-      context_length: 19
-    }
-  }
-}
-parameters {
-  name: "___row_conv_layer_0__.w0"
-  size: 48640
-  initial_mean: 0.0
-  initial_std: 0.229415733871
-  dims: 19
-  dims: 2560
-  initial_strategy: 0
-  initial_smart: true
-}
-input_layer_names: "data"
-output_layer_names: "__row_conv_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__row_conv_layer_0__"
-  input_layer_names: "data"
-  output_layer_names: "__row_conv_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
deleted file mode 100644
index c2786ff55..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__row_l2_norm_layer_0__"
-  type: "row_l2_norm"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input"
-output_layer_names: "__row_l2_norm_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "__row_l2_norm_layer_0__"
-  input_layer_names: "input"
-  output_layer_names: "__row_l2_norm_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
deleted file mode 100644
index 35ade126a..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
+++ /dev/null
@@ -1,72 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__scale_shift_0__"
-  type: "scale_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___scale_shift_0__.w0"
-  }
-}
-layers {
-  name: "__scale_shift_1__"
-  type: "scale_shift"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    input_parameter_name: "___scale_shift_1__.w0"
-  }
-  bias_parameter_name: "___scale_shift_1__.wbias"
-}
-parameters {
-  name: "___scale_shift_0__.w0"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___scale_shift_1__.w0"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 1.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: true
-}
-parameters {
-  name: "___scale_shift_1__.wbias"
-  size: 1
-  initial_mean: 0.0
-  initial_std: 0.0
-  dims: 1
-  dims: 1
-  initial_strategy: 0
-  initial_smart: false
-}
-input_layer_names: "data"
-output_layer_names: "__scale_shift_0__"
-output_layer_names: "__scale_shift_1__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__scale_shift_0__"
-  layer_names: "__scale_shift_1__"
-  input_layer_names: "data"
-  output_layer_names: "__scale_shift_0__"
-  output_layer_names: "__scale_shift_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
deleted file mode 100644
index d20133a10..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 2016
-  active_type: ""
-  height: 48
-  width: 42
-}
-layers {
-  name: "indices"
-  type: "data"
-  size: 6
-  active_type: ""
-}
-layers {
-  name: "__scale_sub_region_0__"
-  type: "scale_sub_region"
-  size: 2016
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    scale_sub_region_conf {
-      image_conf {
-        channels: 1
-        img_size: 42
-        img_size_y: 48
-      }
-      value: 0.0
-    }
-  }
-  inputs {
-    input_layer_name: "indices"
-  }
-  height: 48
-  width: 42
-}
-input_layer_names: "data"
-input_layer_names: "indices"
-output_layer_names: "__scale_sub_region_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "indices"
-  layer_names: "__scale_sub_region_0__"
-  input_layer_names: "data"
-  input_layer_names: "indices"
-  output_layer_names: "__scale_sub_region_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
deleted file mode 100644
index 9d1b41c9d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
+++ /dev/null
@@ -1,51 +0,0 @@
-type: "nn"
-layers {
-  name: "data1"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "data2"
-  type: "data"
-  size: 30
-  active_type: ""
-}
-layers {
-  name: "__seqconcat_0__"
-  type: "seqconcat"
-  size: 30
-  active_type: ""
-  inputs {
-    input_layer_name: "data1"
-  }
-  inputs {
-    input_layer_name: "data2"
-  }
-}
-layers {
-  name: "__seqreshape_0__"
-  type: "seqreshape"
-  size: 5
-  active_type: ""
-  inputs {
-    input_layer_name: "data1"
-  }
-}
-input_layer_names: "data1"
-input_layer_names: "data2"
-output_layer_names: "__seqconcat_0__"
-output_layer_names: "__seqreshape_0__"
-sub_models {
-  name: "root"
-  layer_names: "data1"
-  layer_names: "data2"
-  layer_names: "__seqconcat_0__"
-  layer_names: "__seqreshape_0__"
-  input_layer_names: "data1"
-  input_layer_names: "data2"
-  output_layer_names: "__seqconcat_0__"
-  output_layer_names: "__seqreshape_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
deleted file mode 100644
index 5b73d614f..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
+++ /dev/null
@@ -1,79 +0,0 @@
-type: "nn"
-layers {
-  name: "word"
-  type: "data"
-  size: 128
-  active_type: ""
-}
-layers {
-  name: "starts"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "ends"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "__seq_slice_layer_0__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "starts"
-  }
-  inputs {
-    input_layer_name: "ends"
-  }
-}
-layers {
-  name: "__seq_slice_layer_1__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "starts"
-  }
-  select_first: true
-}
-layers {
-  name: "__seq_slice_layer_2__"
-  type: "seq_slice"
-  size: 128
-  active_type: ""
-  inputs {
-    input_layer_name: "word"
-  }
-  inputs {
-    input_layer_name: "ends"
-  }
-  select_first: false
-}
-input_layer_names: "word"
-output_layer_names: "__seq_slice_layer_0__"
-output_layer_names: "__seq_slice_layer_1__"
-output_layer_names: "__seq_slice_layer_2__"
-sub_models {
-  name: "root"
-  layer_names: "word"
-  layer_names: "starts"
-  layer_names: "ends"
-  layer_names: "__seq_slice_layer_0__"
-  layer_names: "__seq_slice_layer_1__"
-  layer_names: "__seq_slice_layer_2__"
-  input_layer_names: "word"
-  output_layer_names: "__seq_slice_layer_0__"
-  output_layer_names: "__seq_slice_layer_1__"
-  output_layer_names: "__seq_slice_layer_2__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
deleted file mode 100644
index 8989561df..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ /dev/null
@@ -1,162 +0,0 @@
-type: "nn"
-layers {
-  name: "dat_in"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__seq_pooling_0__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_1__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_2__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_3__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_4__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_5__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-layers {
-  name: "__seq_pooling_6__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_7__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "average"
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_8__"
-  type: "average"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  average_strategy: "sum"
-  trans_type: "non-seq"
-  seq_pool_stride: 5
-}
-layers {
-  name: "__seq_pooling_9__"
-  type: "max"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "dat_in"
-  }
-  output_max_index: true
-  trans_type: "non-seq"
-  seq_pool_stride: -1
-}
-input_layer_names: "dat_in"
-output_layer_names: "__seq_pooling_0__"
-output_layer_names: "__seq_pooling_1__"
-output_layer_names: "__seq_pooling_2__"
-output_layer_names: "__seq_pooling_3__"
-output_layer_names: "__seq_pooling_4__"
-output_layer_names: "__seq_pooling_5__"
-output_layer_names: "__seq_pooling_6__"
-output_layer_names: "__seq_pooling_7__"
-output_layer_names: "__seq_pooling_8__"
-output_layer_names: "__seq_pooling_9__"
-sub_models {
-  name: "root"
-  layer_names: "dat_in"
-  layer_names: "__seq_pooling_0__"
-  layer_names: "__seq_pooling_1__"
-  layer_names: "__seq_pooling_2__"
-  layer_names: "__seq_pooling_3__"
-  layer_names: "__seq_pooling_4__"
-  layer_names: "__seq_pooling_5__"
-  layer_names: "__seq_pooling_6__"
-  layer_names: "__seq_pooling_7__"
-  layer_names: "__seq_pooling_8__"
-  layer_names: "__seq_pooling_9__"
-  input_layer_names: "dat_in"
-  output_layer_names: "__seq_pooling_0__"
-  output_layer_names: "__seq_pooling_1__"
-  output_layer_names: "__seq_pooling_2__"
-  output_layer_names: "__seq_pooling_3__"
-  output_layer_names: "__seq_pooling_4__"
-  output_layer_names: "__seq_pooling_5__"
-  output_layer_names: "__seq_pooling_6__"
-  output_layer_names: "__seq_pooling_7__"
-  output_layer_names: "__seq_pooling_8__"
-  output_layer_names: "__seq_pooling_9__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
deleted file mode 100644
index 4aa041ea2..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
+++ /dev/null
@@ -1,40 +0,0 @@
-type: "nn"
-layers {
-  name: "input"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "label"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "__smooth_l1_cost_0__"
-  type: "smooth_l1"
-  size: 1
-  active_type: ""
-  inputs {
-    input_layer_name: "input"
-  }
-  inputs {
-    input_layer_name: "label"
-  }
-  coeff: 1.0
-}
-input_layer_names: "input"
-input_layer_names: "label"
-output_layer_names: "__smooth_l1_cost_0__"
-sub_models {
-  name: "root"
-  layer_names: "input"
-  layer_names: "label"
-  layer_names: "__smooth_l1_cost_0__"
-  input_layer_names: "input"
-  input_layer_names: "label"
-  output_layer_names: "__smooth_l1_cost_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
deleted file mode 100644
index 569b0b945..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
+++ /dev/null
@@ -1,72 +0,0 @@
-model_config {
-  type: "nn"
-  layers {
-    name: "a"
-    type: "data"
-    size: 10
-    active_type: ""
-  }
-  input_layer_names: "a"
-  output_layer_names: "a"
-  sub_models {
-    name: "root"
-    layer_names: "a"
-    input_layer_names: "a"
-    output_layer_names: "a"
-    is_recurrent_layer_group: false
-  }
-}
-data_config {
-  type: "py2"
-  files: "train.list"
-  async_load_data: false
-  for_test: false
-  load_data_module: "a"
-  load_data_object: "c"
-  load_data_args: ""
-  data_ratio: 1
-  is_main_data: true
-  usage_ratio: 1.0
-}
-opt_config {
-  batch_size: 1000
-  algorithm: "sgd"
-  learning_rate: 0.001
-  learning_rate_decay_a: 0.0
-  learning_rate_decay_b: 0.0
-  l1weight: 0.1
-  l2weight: 0.0
-  c1: 0.0001
-  backoff: 0.5
-  owlqn_steps: 10
-  max_backoff: 5
-  l2weight_zero_iter: 0
-  average_window: 0
-  learning_method: "momentum"
-  ada_epsilon: 1e-06
-  do_average_in_cpu: false
-  ada_rou: 0.95
-  learning_rate_schedule: "poly"
-  delta_add_rate: 1.0
-  shrink_parameter_value: 0
-  adam_beta1: 0.9
-  adam_beta2: 0.999
-  adam_epsilon: 1e-08
-  learning_rate_args: ""
-  async_lagged_grad_discard_ratio: 1.5
-}
-test_data_config {
-  type: "py2"
-  files: "test.list"
-  async_load_data: false
-  for_test: true
-  load_data_module: "b"
-  load_data_object: "d"
-  load_data_args: ""
-  data_ratio: 1
-  is_main_data: true
-  usage_ratio: 1.0
-}
-save_dir: "./output/model"
-start_pass: 0
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
deleted file mode 100644
index ca1b2d8cf..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_spp_layer.protostr
+++ /dev/null
@@ -1,40 +0,0 @@
-type: "nn"
-layers {
-  name: "data"
-  type: "data"
-  size: 3200
-  active_type: ""
-  height: 20
-  width: 10
-}
-layers {
-  name: "__spp_0__"
-  type: "spp"
-  size: 80
-  active_type: ""
-  inputs {
-    input_layer_name: "data"
-    spp_conf {
-      image_conf {
-        channels: 16
-        img_size: 10
-        img_size_y: 20
-      }
-      pool_type: "max-projection"
-      pyramid_height: 2
-    }
-  }
-  height: 1
-  width: 5
-}
-input_layer_names: "data"
-output_layer_names: "__spp_0__"
-sub_models {
-  name: "root"
-  layer_names: "data"
-  layer_names: "__spp_0__"
-  input_layer_names: "data"
-  output_layer_names: "__spp_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
deleted file mode 100644
index 4b906b113..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
+++ /dev/null
@@ -1,37 +0,0 @@
-type: "nn"
-layers {
-  name: "input_seq"
-  type: "data"
-  size: 300
-  active_type: ""
-}
-layers {
-  name: "input"
-  type: "data"
-  size: 5
-  active_type: ""
-}
-layers {
-  name: "__sub_nested_seq_layer_0__"
-  type: "sub_nested_seq"
-  size: 300
-  active_type: ""
-  inputs {
-    input_layer_name: "input_seq"
-  }
-  inputs {
-    input_layer_name: "input"
-  }
-}
-input_layer_names: "input_seq"
-output_layer_names: "__sub_nested_seq_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "input_seq"
-  layer_names: "input"
-  layer_names: "__sub_nested_seq_layer_0__"
-  input_layer_names: "input_seq"
-  output_layer_names: "__sub_nested_seq_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
deleted file mode 100644
index 89ed28406..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr
+++ /dev/null
@@ -1,27 +0,0 @@
-type: "nn"
-layers {
-  name: "probs"
-  type: "data"
-  size: 100
-  active_type: ""
-}
-layers {
-  name: "__sampling_id_layer_0__"
-  type: "sampling_id"
-  size: 100
-  active_type: ""
-  inputs {
-    input_layer_name: "probs"
-  }
-}
-input_layer_names: "probs"
-output_layer_names: "__sampling_id_layer_0__"
-sub_models {
-  name: "root"
-  layer_names: "probs"
-  layer_names: "__sampling_id_layer_0__"
-  input_layer_names: "probs"
-  output_layer_names: "__sampling_id_layer_0__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
deleted file mode 100644
index 7a2f3eab3..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
+++ /dev/null
@@ -1,87 +0,0 @@
-type: "nn"
-layers {
-  name: "a"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "b"
-  type: "data"
-  size: 10
-  active_type: ""
-}
-layers {
-  name: "__addto_0__"
-  type: "addto"
-  size: 10
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__concat_0__"
-  type: "concat"
-  size: 20
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-  }
-  inputs {
-    input_layer_name: "b"
-  }
-  height: 0
-  width: 0
-  depth: 1
-}
-layers {
-  name: "__concat_1__"
-  type: "concat2"
-  size: 20
-  active_type: ""
-  inputs {
-    input_layer_name: "a"
-    proj_conf {
-      type: "identity"
-      name: "___concat_1__.w0"
-      input_size: 10
-      output_size: 10
-    }
-  }
-  inputs {
-    input_layer_name: "b"
-    proj_conf {
-      type: "identity"
-      name: "___concat_1__.w1"
-      input_size: 10
-      output_size: 10
-    }
-  }
-}
-input_layer_names: "a"
-input_layer_names: "b"
-output_layer_names: "__addto_0__"
-output_layer_names: "__concat_0__"
-output_layer_names: "__concat_1__"
-sub_models {
-  name: "root"
-  layer_names: "a"
-  layer_names: "b"
-  layer_names: "__addto_0__"
-  layer_names: "__concat_0__"
-  layer_names: "__concat_1__"
-  input_layer_names: "a"
-  input_layer_names: "b"
-  output_layer_names: "__addto_0__"
-  output_layer_names: "__concat_0__"
-  output_layer_names: "__concat_1__"
-  is_recurrent_layer_group: false
-}
-
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
deleted file mode 100755
index c8a3b190b..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-
-set -e
-PYTHON_EXEC=$1
-COMPARE_PROTO_UTIL=$2
-
-protostr=`dirname $0`/protostr
-
-files=`ls $protostr | grep -v "unittest"`
-
-./generate_protostr.sh ${PYTHON_EXEC}
-
-. ./file_list.sh
-
-if [ -z ${COMPARE_PROTO_UTIL} ]; then
-  for file in $files
-  do
-      base_protostr=$protostr/$file
-      new_protostr=$protostr/$file.unittest
-      diff $base_protostr $new_protostr -u
-      diff $protostr/$file $protostr/$file.non_file_config.unittest -u
-  done
-else
-  for file in ${configs[*]}
-  do
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
-    fi
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
-    fi
-  done
-
-  for file in ${whole_configs[*]}
-  do
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
-    fi
-    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
-      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
-    fi
-  done
-fi
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
deleted file mode 100644
index 3229252a2..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-a = data_layer(name='feature_a', size=200)
-b = data_layer(name='feature_b', size=200)
-
-fc_param = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0)
-bias_param = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0)
-
-softmax_param = ParamAttr(
-    name='softmax_param', initial_max=1.0, initial_min=-1.0)
-
-hidden_a = fc_layer(
-    input=a, size=200, param_attr=fc_param, bias_attr=bias_param)
-hidden_b = fc_layer(
-    input=b, size=200, param_attr=fc_param, bias_attr=bias_param)
-
-predict = fc_layer(
-    input=[hidden_a, hidden_b],
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    size=10,
-    act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
deleted file mode 100644
index dff561fdf..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
+++ /dev/null
@@ -1,54 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data_1 = data_layer(name='data_a', size=100)
-data_2 = data_layer(name='data_b', size=100)
-
-mixed_param = ParamAttr(name='mixed_param')
-
-gru_param = ParamAttr(name='gru_param')
-gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)
-
-gru1 = simple_gru(
-    input=data_1,
-    size=200,
-    mixed_param_attr=mixed_param,
-    mixed_bias_param_attr=False,
-    gru_bias_attr=gru_bias,
-    gru_param_attr=gru_param)
-
-gru2 = simple_gru(
-    input=data_2,
-    size=200,
-    mixed_param_attr=mixed_param,
-    mixed_bias_param_attr=False,
-    gru_bias_attr=gru_bias,
-    gru_param_attr=gru_param)
-
-softmax_param = ParamAttr(name='softmax_param')
-
-predict = fc_layer(
-    input=[last_seq(input=gru1), last_seq(input=gru2)],
-    size=10,
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    act=SoftmaxActivation())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
deleted file mode 100644
index 97ef2d07a..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data_1 = data_layer(name='data_a', size=100)
-data_2 = data_layer(name='data_b', size=100)
-
-mixed_param = ParamAttr(name='mixed_param')
-
-with mixed_layer(size=400, bias_attr=False) as m1:
-    m1 += full_matrix_projection(input=data_1, param_attr=mixed_param)
-
-with mixed_layer(size=400, bias_attr=False) as m2:
-    m2 += full_matrix_projection(input=data_2, param_attr=mixed_param)
-
-lstm_param = ParamAttr(name='lstm_param')
-lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.)
-
-lstm1 = lstmemory_group(
-    input=m1,
-    param_attr=lstm_param,
-    lstm_bias_attr=lstm_bias,
-    input_proj_bias_attr=False)
-
-lstm2 = lstmemory_group(
-    input=m2,
-    param_attr=lstm_param,
-    lstm_bias_attr=lstm_bias,
-    input_proj_bias_attr=False)
-
-softmax_param = ParamAttr(name='softmax_param')
-
-predict = fc_layer(
-    input=[last_seq(input=lstm1), last_seq(input=lstm2)],
-    size=10,
-    param_attr=[softmax_param, softmax_param],
-    bias_attr=False,
-    act=SoftmaxActivation())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
deleted file mode 100644
index f882efcba..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=200)
-
-hidden = fc_layer(input=din, size=200, act=SigmoidActivation())
-
-rnn = recurrent_layer(input=hidden, act=SigmoidActivation())
-
-rnn2 = recurrent_layer(input=hidden, act=SigmoidActivation(), reverse=True)
-
-lstm1_param = fc_layer(
-    input=hidden, size=200 * 4, act=LinearActivation(), bias_attr=False)
-
-lstm1 = lstmemory(input=lstm1_param, act=SigmoidActivation())
-
-lstm2_param = fc_layer(
-    input=hidden, size=200 * 4, act=LinearActivation(), bias_attr=False)
-
-lstm2 = lstmemory(input=lstm2_param, act=SigmoidActivation(), reverse=True)
-
-gru1_param = fc_layer(
-    input=hidden, size=200 * 3, act=LinearActivation(), bias_attr=False)
-gru1 = grumemory(input=gru1_param, act=SigmoidActivation())
-
-gru2_param = fc_layer(
-    input=hidden, size=200 * 3, act=LinearActivation(), bias_attr=False)
-gru2 = grumemory(input=gru2_param, act=SigmoidActivation(), reverse=True)
-
-outputs(
-    last_seq(input=rnn),
-    first_seq(input=rnn2),
-    last_seq(input=lstm1),
-    first_seq(input=lstm2),
-    last_seq(input=gru1),
-    first_seq(gru2))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
deleted file mode 100644
index 169038deb..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-#data = data_layer(name='data', size=180, width=30, height=6)
-#batchNorm = batch_norm_layer(data, num_channels=1)
-#outputs(batchNorm)
-
-data3D = data_layer(name='data3D', size=120 * 3, width=20, height=6, depth=3)
-batchNorm3D = batch_norm_layer(data3D, num_channels=1, img3D=True)
-outputs(batchNorm3D)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
deleted file mode 100644
index d29e4e5c4..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=120)
-
-outputs(bidirectional_gru(input=din, size=40, return_seq=True))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
deleted file mode 100644
index 5e724ba7d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2304)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-bilinear = bilinear_interp_layer(input=conv, out_size_x=64, out_size_y=64)
-
-pool = img_pool_layer(
-    input=bilinear,
-    num_channels=16,
-    pool_size=2,
-    stride=2,
-    pool_type=MaxPooling())
-
-fc = fc_layer(input=pool, size=384, bias_attr=False)
-
-outputs(fc)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
deleted file mode 100644
index 95a1192bf..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-clip = clip_layer(input=data, min=-10, max=10)
-
-outputs(clip)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
deleted file mode 100644
index 9b791a022..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import re
-import getopt
-
-
-def main(print_whole_config, globals, locals):
-    '''
-     this test will all test_config.py
-  '''
-    cmdstr = """from paddle.trainer.config_parser import parse_config\n"""
-    importstr = ""
-    functionstr = ""
-
-    for line in sys.stdin:
-        if re.match("^import", line) or re.match("^from.*import", line):
-            importstr = importstr + line
-        else:
-            functionstr = functionstr + "  " + line
-
-    cmdstr = cmdstr + importstr + """def configs():\n""" + functionstr
-    #cmdstr = cmdstr + """def configs():\n""" + importstr + functionstr
-    if print_whole_config:
-        cmdstr = cmdstr + """print parse_config(configs, "")"""
-    else:
-        cmdstr = cmdstr + """print parse_config(configs, "").model_config"""
-
-    exec (cmdstr, globals, locals)
-
-
-if __name__ == '__main__':
-    whole = False
-    opts, args = getopt.getopt(sys.argv[1:], "", ["whole"])
-    for op, value in opts:
-        if op == "--whole":
-            whole = True
-    main(whole, globals(), locals())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
deleted file mode 100644
index f9966e399..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
+++ /dev/null
@@ -1,63 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-num_channels = 3
-filter_size = 3
-filter_size_y = 3
-filter_size_z = 3
-stride = 2
-stride_y = 2
-stride_z = 2
-padding = 1
-padding_y = 1
-padding_z = 1
-groups = 1
-
-data = data_layer(
-    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
-# first
-conv3d_1 = img_conv3d_layer(
-    input=data,
-    name='conv3d_1',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=filter_size,
-    stride=stride,
-    padding=padding,
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=False,
-    layer_type="conv3d",
-    act=LinearActivation())
-# second
-conv3d_2 = img_conv3d_layer(
-    input=data,
-    name='conv3d_2',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=[filter_size, filter_size_y, filter_size_z],
-    stride=[stride, stride_y, stride_z],
-    padding=[padding, padding_y, padding_z],
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=False,
-    layer_type="conv3d",
-    act=LinearActivation())
-outputs(conv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
deleted file mode 100644
index 351694fd5..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-seq_in = data_layer(name='input', size=200)
-labels = data_layer(name='labels', size=5000)
-
-probs = data_layer(name='probs', size=10)
-xe_label = data_layer(name='xe-label', size=10)
-
-hidden = fc_layer(input=seq_in, size=4)
-outputs(
-    ctc_layer(
-        input=seq_in, label=labels),
-    warp_ctc_layer(
-        input=seq_in, label=labels, blank=0),
-    crf_layer(
-        input=hidden, label=data_layer(
-            name='crf_label', size=4)),
-    rank_cost(
-        left=data_layer(
-            name='left', size=1),
-        right=data_layer(
-            name='right', size=1),
-        label=data_layer(
-            name='label', size=1)),
-    lambda_cost(
-        input=data_layer(
-            name='list_feature', size=100),
-        score=data_layer(
-            name='list_scores', size=1)),
-    cross_entropy(
-        input=probs, label=xe_label),
-    cross_entropy_with_selfnorm(
-        input=probs, label=xe_label),
-    huber_regression_cost(
-        input=seq_in, label=labels),
-    huber_classification_cost(
-        input=data_layer(
-            name='huber_probs', size=1),
-        label=data_layer(
-            name='huber_label', size=1)),
-    multi_binary_label_cross_entropy(
-        input=probs, label=xe_label),
-    sum_cost(input=hidden),
-    nce_layer(
-        input=hidden, label=labels))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
deleted file mode 100644
index 8cbcf5de0..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-data = data_layer(name='input', size=300)
-lbl = data_layer(name='label', size=1)
-wt = data_layer(name='weight', size=1)
-fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=fc, label=lbl, weight=wt),
-    square_error_cost(
-        input=fc, label=lbl, weight=wt),
-    nce_layer(
-        input=fc,
-        label=data_layer(
-            name='multi_class_label', size=500),
-        weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
deleted file mode 100644
index b4ffff252..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-refernce_data = data_layer(name='data', size=768, height=16, width=16)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
-
-crop = crop_layer(input=[pool, refernce_data], axis=2)
-
-outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
deleted file mode 100644
index 4a5bdf118..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-
-from paddle.trainer_config_helpers import *
-beam_size = 5
-
-# the first beam expansion.
-sentence_states = data_layer(name="sentence_states", size=32)
-sentence_scores = data_layer(name="sentence_scores", size=1)
-topk_sentence_ids = kmax_seq_score_layer(
-    input=sentence_scores, beam_size=beam_size)
-
-# the second beam expansion.
-topk_sen = sub_nested_seq_layer(
-    input=sentence_states, selected_indices=topk_sentence_ids)
-start_pos_scores = fc_layer(input=topk_sen, size=1, act=LinearActivation())
-topk_start_pos_ids = kmax_seq_score_layer(
-    input=sentence_scores, beam_size=beam_size)
-
-# the final beam expansion.
-topk_start_spans = seq_slice_layer(
-    input=topk_sen, starts=topk_start_pos_ids, ends=None)
-end_pos_scores = fc_layer(
-    input=topk_start_spans, size=1, act=LinearActivation())
-topk_end_pos_ids = kmax_seq_score_layer(
-    input=end_pos_scores, beam_size=beam_size)
-
-# define the cost
-sentence_idx = data_layer(name="sentences_ids", size=1)
-start_idx = data_layer(name="start_ids", size=1)
-end_idx = data_layer(name="end_ids", size=1)
-cost = cross_entropy_over_beam(input=[
-    BeamInput(
-        candidate_scores=sentence_scores,
-        selected_candidates=topk_sentence_ids,
-        gold=sentence_idx), BeamInput(
-            candidate_scores=start_pos_scores,
-            selected_candidates=topk_start_pos_ids,
-            gold=start_idx), BeamInput(
-                candidate_scores=end_pos_scores,
-                selected_candidates=topk_end_pos_ids,
-                gold=end_idx)
-])
-
-outputs(cost)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
deleted file mode 100644
index 08e701c7a..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-num_channels = 3
-filter_size = 3
-filter_size_y = 3
-filter_size_z = 3
-stride = 2
-stride_y = 2
-stride_z = 2
-padding = 1
-padding_y = 1
-padding_z = 1
-groups = 1
-
-data = data_layer(
-    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
-
-# first
-deconv3d_1 = img_conv3d_layer(
-    input=data,
-    name='deconv3d_1',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=filter_size,
-    stride=stride,
-    padding=padding,
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=True,
-    layer_type="deconv3d",
-    act=LinearActivation())
-# second
-deconv3d_2 = img_conv3d_layer(
-    input=data,
-    name='deconv3d_2',
-    num_filters=16,
-    num_channels=num_channels,
-    filter_size=[filter_size, filter_size_y, filter_size_z],
-    stride=[stride, stride_y, stride_z],
-    padding=[padding, padding_y, padding_z],
-    groups=groups,
-    bias_attr=True,
-    shared_biases=True,
-    trans=True,
-    layer_type="deconv3d",
-    act=LinearActivation())
-outputs(deconv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
deleted file mode 100644
index 4ecd1c2b7..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
-
-input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
-
-priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
-
-detout = detection_output_layer(
-    input_loc=input_loc,
-    input_conf=input_conf,
-    priorbox=priorbox,
-    num_classes=21,
-    nms_threshold=0.45,
-    nms_top_k=400,
-    keep_top_k=200,
-    confidence_threshold=0.01,
-    background_id=0,
-    name='test_detection_output')
-
-outputs(detout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
deleted file mode 100644
index 9b444bc2c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-vec1 = data_layer(name='vector1', size=10)
-vec2 = data_layer(name='vector2', size=10)
-dot_product = dot_prod_layer(input1=vec1, input2=vec2)
-
-outputs(dot_product)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
deleted file mode 100644
index 85101d2b9..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ /dev/null
@@ -1,28 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-data_seq = data_layer(name='data_seq', size=30)
-
-outputs(
-    expand_layer(
-        input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_SEQUENCE),
-    expand_layer(
-        input=din,
-        expand_as=data_seq,
-        expand_level=ExpandLevel.FROM_NO_SEQUENCE))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
deleted file mode 100644
index 48ac46c5b..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=1024)
-
-fm = factorization_machine(input=data, factor_size=10)
-
-outputs(fm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
deleted file mode 100644
index f1e454d21..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=100)
-
-trans = trans_layer(input=din)
-
-hidden = fc_layer(input=trans, size=100, bias_attr=False)
-
-mask = data_layer(name='mask', size=100)
-
-hidden_sel = selective_fc_layer(
-    input=din, select=mask, size=100, act=SigmoidActivation())
-
-outputs(hidden, hidden_sel)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
deleted file mode 100644
index afc3e9207..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=256)
-glu = gated_unit_layer(
-    size=512,
-    input=data,
-    act=TanhActivation(),
-    gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
-    gate_param_attr=ParamAttr(initial_std=1e-4),
-    gate_bias_attr=ParamAttr(initial_std=1),
-    inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
-    inproj_param_attr=ParamAttr(initial_std=1e-4),
-    inproj_bias_attr=ParamAttr(initial_std=1),
-    layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0))
-
-outputs(glu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
deleted file mode 100644
index ac9902d08..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-4)
-
-din = data_layer(name='data', size=120)
-
-outputs(
-    grumemory(
-        input=din,
-        size=40,
-        reverse=True,
-        gate_act=TanhActivation(),
-        act=SigmoidActivation()))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
deleted file mode 100644
index da781c149..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+++ /dev/null
@@ -1,22 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='data', size=100)
-label = data_layer(name='label', size=10)
-
-outputs(hsigmoid(input=din, label=label, num_classes=10))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
deleted file mode 100644
index 171da10f7..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name="input_seq", size=128)
-scores = fc_layer(input=data, size=1, act=ExpActivation())
-kmax_seq_id = kmax_seq_score_layer(input=scores, beam_size=5)
-
-outputs(kmax_seq_id)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
deleted file mode 100644
index 42c9b5dee..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-outputs(
-    l2_distance_layer(
-        x=data_layer(
-            name='x', size=128), y=data_layer(
-                name='y', size=128)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
deleted file mode 100644
index 26eeea546..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=128)
-
-outputs(
-    lstmemory(
-        input=din,
-        reverse=True,
-        gate_act=TanhActivation(),
-        act=TanhActivation(),
-        size=32))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
deleted file mode 100644
index 2cd41a306..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2304, height=48, width=48)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-maxout = maxout_layer(input=conv, num_channels=16, groups=2)
-
-pool = img_pool_layer(
-    input=maxout, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
-
-conv2 = img_conv_layer(
-    input=pool,
-    filter_size=3,
-    num_channels=8,
-    num_filters=128,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-maxout2 = maxout_layer(input=conv2, num_channels=128, groups=4)
-
-block = block_expand_layer(
-    input=maxout2,
-    num_channels=32,
-    stride_x=1,
-    stride_y=1,
-    block_x=1,
-    block_y=6)
-
-fc = fc_layer(input=block, size=384, bias_attr=False)
-
-outputs(fc)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
deleted file mode 100644
index b4fd9052c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
-
-input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
-
-priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
-
-label = data_layer(name='label', size=24, height=4, width=6)
-
-multibox_loss = multibox_loss_layer(
-    input_loc=input_loc,
-    input_conf=input_conf,
-    priorbox=priorbox,
-    label=label,
-    num_classes=21,
-    overlap_threshold=0.5,
-    neg_pos_ratio=3.0,
-    neg_overlap=0.5,
-    background_id=0,
-    name='test_multibox_loss')
-
-outputs(multibox_loss)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
deleted file mode 100644
index bfba07be8..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-index = data_layer(name='index', size=1)
-din1 = data_layer(name='data1', size=30)
-din2 = data_layer(name='data2', size=30)
-din3 = data_layer(name='data3', size=30)
-
-dout = multiplex_layer([index, din1, din2, din3])
-
-outputs(dout)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
deleted file mode 100644
index 891894172..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+++ /dev/null
@@ -1,44 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-weight = data_layer(name='w', size=1)
-a = data_layer(name='a', size=100)
-b = data_layer(name='b', size=100)
-c = data_layer(name='c', size=200)
-d = data_layer(name='d', size=31)
-
-outputs(
-    interpolation_layer(
-        input=[a, b], weight=weight),
-    power_layer(
-        input=a, weight=weight),
-    scaling_layer(
-        input=a, weight=weight),
-    cos_sim(
-        a=a, b=b),
-    cos_sim(
-        a=a, b=c, size=2),
-    sum_to_one_norm_layer(input=a),
-    conv_shift_layer(
-        a=a, b=d),
-    tensor_layer(
-        a=a, b=b, size=1000),
-    slope_intercept_layer(
-        input=a, slope=0.7, intercept=0.9),
-    linear_comb_layer(
-        weights=b, vectors=c))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
deleted file mode 100644
index c5825c82e..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=1,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
-
-pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
-
-outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
deleted file mode 100644
index 5ff52c195..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_rate=1e-5)
-
-data_2d = data_layer(name='data_2d', size=6000, height=20, width=10)
-
-pool_2d = img_pool_layer(
-    name="pool___2d",
-    input=data_2d,
-    num_channels=30,
-    pool_size=5,
-    stride=3,
-    padding=1,
-    pool_type=AvgPooling())
-outputs(pool_2d)
-
-data_3d = data_layer(
-    name='data_3d_1', size=60000, depth=10, height=20, width=10)
-
-pool_3d_1 = img_pool3d_layer(
-    name="pool_3d_1",
-    input=data_3d,
-    num_channels=30,
-    pool_size=5,
-    stride=3,
-    padding=1,
-    pool_type=AvgPooling())
-outputs(pool_3d_1)
-
-pool_3d_2 = img_pool3d_layer(
-    name="pool_3d_2",
-    input=data_3d,
-    num_channels=30,
-    pool_size=[5, 5, 5],
-    stride=[3, 3, 3],
-    padding=[1, 1, 1],
-    pool_type=MaxPooling())
-outputs(pool_3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
deleted file mode 100644
index d803a0d13..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300, height=10, width=10)
-prelu = prelu_layer(input=data, num_channels=3)
-prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
-prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
-prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
-prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)
-
-outputs(prelu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
deleted file mode 100644
index ca1f5a457..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='input', size=100)
-
-print_layer(input=din)
-
-outputs(din)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
deleted file mode 100644
index d44870d80..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=100)
-
-enc = din
-for i in range(32):
-    enc = addto_layer([enc, enc])
-
-pred = fc_layer(
-    input=fc_layer(
-        input=enc, size=32, act=ReluActivation()),
-    size=10,
-    act=SoftmaxActivation())
-outputs(pred)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
deleted file mode 100644
index ee90e830d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din = data_layer(name='data', size=30)
-
-outputs(
-    repeat_layer(
-        input=din, num_repeats=10, as_row_vector=True),
-    repeat_layer(
-        input=din, num_repeats=10, act=TanhActivation(), as_row_vector=False))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
deleted file mode 100644
index 4aa81919d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-resized = resize_layer(input=data, size=150)
-
-outputs(resized)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
deleted file mode 100644
index 3824ef599..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-seq = data_layer(name='seq_input', size=100)
-sub_seq = data_layer(name='sub_seq_input', size=100)
-lbl = data_layer(name='label', size=1)
-
-
-def generate_rnn_simple(name):
-    def rnn_simple(s):
-        m = memory(name=name, size=200)
-        fc = fc_layer(input=[s, m], size=200, name=name)
-        return fc
-
-    return rnn_simple
-
-
-def generate_rnn_simple_no_name():
-    def rnn_simple(s):
-        m = memory(name=None, size=200)
-        fc = fc_layer(input=[s, m], size=200)
-        m.set_input(fc)
-        return fc
-
-    return rnn_simple
-
-
-with mixed_layer() as lstm_param:  # test lstm unit, rnn group
-    lstm_param += full_matrix_projection(input=seq, size=100 * 4)
-
-with mixed_layer() as gru_param:
-    gru_param += full_matrix_projection(input=seq, size=100 * 3)
-
-outputs(
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_forward'), input=seq)),
-    first_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_back'), input=seq, reverse=True)),
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple('rnn_subseq_forward'),
-        input=SubsequenceInput(input=sub_seq))),
-    last_seq(input=lstmemory_group(
-        input=lstm_param, size=100)),
-    last_seq(input=gru_group(
-        input=gru_param, size=100)),
-    last_seq(input=recurrent_group(
-        step=generate_rnn_simple_no_name(), input=seq)), )
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
deleted file mode 100644
index 6929d106c..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
-
-rois = data_layer(name='rois', size=10)
-
-conv = img_conv_layer(
-    input=data,
-    filter_size=3,
-    num_channels=3,
-    num_filters=16,
-    padding=1,
-    act=LinearActivation(),
-    bias_attr=True)
-
-roi_pool = roi_pool_layer(
-    input=conv,
-    rois=rois,
-    pooled_width=7,
-    pooled_height=7,
-    spatial_scale=1. / 16)
-
-outputs(roi_pool)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
deleted file mode 100644
index 6381a26fe..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2560)
-
-row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
-
-outputs(row_conv)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
deleted file mode 100644
index 3c17d2ccf..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-row_l2_norm = row_l2_norm_layer(input=data)
-
-outputs(row_l2_norm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
deleted file mode 100644
index ae8a25ba9..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='data', size=100)
-
-scale = scale_shift_layer(input=data, bias_attr=False)
-
-scale_shift = scale_shift_layer(input=data)
-
-outputs(scale, scale_shift)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
deleted file mode 100644
index e4f7120bc..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-data = data_layer(name='data', size=2016, height=48, width=42)
-indices = data_layer(name='indices', size=6)
-
-scale_sub_region = scale_sub_region_layer(
-    input=data, indices=indices, value=0.0)
-
-outputs(scale_sub_region)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
deleted file mode 100644
index a6be069e7..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
+++ /dev/null
@@ -1,26 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=1000, learning_rate=1e-5)
-
-din1 = data_layer(name='data1', size=30)
-din2 = data_layer(name='data2', size=30)
-
-opts = []
-opts.append(seq_concat_layer(a=din1, b=din2))
-opts.append(seq_reshape_layer(input=din1, reshape_size=5))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
deleted file mode 100644
index 510ad3220..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-input_seq = data_layer("word", size=128)
-starts = data_layer("starts", size=5)
-ends = data_layer("ends", size=5)
-
-seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
-seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
-seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
-
-outputs(seq_slice1, seq_slice2, seq_slice3)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
deleted file mode 100644
index 7b951a4cd..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-din = data_layer(name='dat_in', size=100)
-
-POOL_TYPE = [MaxPooling, AvgPooling, SumPooling]
-
-AGG_LEVEL = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]
-
-opts = []
-
-for pt in POOL_TYPE:
-    for al in AGG_LEVEL:
-        opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
-
-for pt in POOL_TYPE:
-    opts.append(
-        pooling_layer(
-            input=din,
-            agg_level=AggregateLevel.TO_NO_SEQUENCE,
-            pooling_type=pt(),
-            stride=5))
-
-opts.append(
-    pooling_layer(
-        input=din, pooling_type=MaxPooling(output_max_index=True)))
-
-outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
deleted file mode 100644
index 32a4e6f6d..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
+++ /dev/null
@@ -1,21 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-data = data_layer(name='input', size=300)
-lbl = data_layer(name='label', size=300)
-smooth_l1 = smooth_l1_cost(input=data, label=lbl)
-
-outputs(smooth_l1)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
deleted file mode 100644
index ea68b5493..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(
-    train_list="train.list",
-    test_list="test.list",
-    module=["a", "b"],
-    obj=("c", "d"))
-settings(learning_rate=1e-3, batch_size=1000)
-
-outputs(data_layer(name="a", size=10))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
deleted file mode 100644
index 0e692d4b6..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=100, learning_rate=1e-5)
-
-data = data_layer(name='data', size=3200, height=20, width=10)
-
-spp = spp_layer(
-    input=data, pyramid_height=2, num_channels=16, pool_type=MaxPooling())
-
-outputs(spp)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
deleted file mode 100644
index 6d1c3175b..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env python
-#coding=utf-8
-from paddle.trainer_config_helpers import *
-
-beam_size = 5
-
-data = data_layer(name='input_seq', size=300)
-selected_ids = data_layer(name='input', size=beam_size)
-sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
-
-outputs(sub_nest_seq)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
deleted file mode 100644
index 8878e73ff..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-settings(batch_size=1000, learning_rate=1e-4)
-
-probs = data_layer(name='probs', size=100)
-
-outputs(
-    sampling_id_layer(input=probs),  # It seems not support training
-
-    # It seems this layer is not correct, and should be rewrite.
-    # block_expand_layer(input=probs, channel=1, block_x=1, block_y=3),
-)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
deleted file mode 100644
index da134f100..000000000
--- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(learning_rate=1e-4, batch_size=1000)
-
-a = data_layer(name='a', size=10)
-b = data_layer(name='b', size=10)
-
-result = addto_layer(input=[a, b])
-concat1 = concat_layer(input=[a, b])
-concat2 = concat_layer(
-    input=[identity_projection(input=a), identity_projection(input=b)])
-
-outputs(result, concat1, concat2)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
deleted file mode 100644
index b3dd8f8fc..000000000
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config_and_serialize
-
-if __name__ == '__main__':
-    parse_config_and_serialize(
-        'trainer_config_helpers/tests/layers_test_config.py', '')
-# layers_test_config.py
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
deleted file mode 100644
index e6cd35ee7..000000000
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-num_classes = 5
-
-x = data_layer(name="input1", size=3)
-y = data_layer(name="input2", size=5)
-
-z = out_prod_layer(input1=x, input2=y)
-
-x1 = fc_layer(input=x, size=5)
-y1 = fc_layer(input=y, size=5)
-
-z1 = mixed_layer(
-    act=LinearActivation(),
-    input=[
-        conv_operator(
-            img=x1,
-            filter=y1,
-            filter_size=1,
-            num_filters=5,
-            num_channels=5,
-            stride=1)
-    ])
-
-assert z1.size > 0
-
-y2 = fc_layer(input=y, size=15)
-z2 = rotate_layer(input=y2, height=5, width=3)
-
-cos1 = cos_sim(a=x1, b=y1)
-cos3 = cos_sim(a=x1, b=y2, size=3)
-
-linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
-
-out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1, z2],
-    size=num_classes,
-    act=SoftmaxActivation())
-
-print_layer(input=[out])
-
-outputs(classification_cost(out, data_layer(name="label", size=num_classes)))
-
-dotmul = mixed_layer(
-    input=[dotmul_operator(
-        a=x1, b=x1), dotmul_projection(input=y1)])
-
-proj_with_attr_init = mixed_layer(
-    input=full_matrix_projection(
-        input=y1,
-        param_attr=ParamAttr(
-            learning_rate=0, initial_mean=0, initial_std=0)),
-    bias_attr=ParamAttr(
-        initial_mean=0, initial_std=0, learning_rate=0),
-    act=LinearActivation(),
-    size=5,
-    name='proj_with_attr_init')
-
-# for ctc
-tmp = fc_layer(
-    input=[x1, dotmul, proj_with_attr_init],
-    size=num_classes + 1,
-    act=SoftmaxActivation())
-ctc = ctc_layer(input=tmp, label=y, size=num_classes + 1)
-ctc_eval = ctc_error_evaluator(input=tmp, label=y)
-
-settings(
-    batch_size=10,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
diff --git a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
deleted file mode 100644
index 4d7542c35..000000000
--- a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-from paddle.trainer.config_parser import parse_config
-
-
-class TestParse(unittest.TestCase):
-    def test_parse(self):
-        a = parse_config('trainer_config_helpers/tests/layers_test_config.py',
-                         '')
-        b = parse_config('trainer_config_helpers/tests/layers_test_config.py',
-                         '')
-        self.assertEqual(a, b)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/trainer_config_helpers/utils.py b/python/paddle/trainer_config_helpers/utils.py
deleted file mode 100644
index fe6e9cd53..000000000
--- a/python/paddle/trainer_config_helpers/utils.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import logger
-import functools
-
-__all__ = ['deprecated']
-
-
-def deprecated(instead):
-    def __impl__(func):
-        @functools.wraps(func)
-        def __wrapper__(*args, **kwargs):
-            logger.warning("The interface %s is deprecated, "
-                           "will be removed soon. Please use %s instead." %
-                           (func.__name__, instead))
-
-            return func(*args, **kwargs)
-
-        return __wrapper__
-
-    return __impl__
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
deleted file mode 100644
index df710c33d..000000000
--- a/python/paddle/v2/__init__.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import optimizer
-import layer
-import activation
-import parameters
-import trainer
-import event
-import data_type
-import topology
-import networks
-import evaluator
-from . import dataset
-from . import reader
-from . import plot
-import attr
-import op
-import pooling
-import inference
-import networks
-import minibatch
-import plot
-import image
-import paddle.trainer.config_parser as cp
-
-__all__ = [
-    'default_startup_program',
-    'default_main_program',
-    'optimizer',
-    'layer',
-    'activation',
-    'parameters',
-    'init',
-    'trainer',
-    'event',
-    'data_type',
-    'attr',
-    'pooling',
-    'dataset',
-    'reader',
-    'topology',
-    'networks',
-    'infer',
-    'plot',
-    'evaluator',
-    'image',
-    'master',
-]
-
-cp.begin_parse()
-
-
-def set_env_vars(trainer_count):
-    '''Auto set CPU environment if have not set before.
-       For MKL:
-         export KMP_AFFINITY, OMP_DYNAMIC according to the Hyper Threading status.
-         export OMP_NUM_THREADS, MKL_NUM_THREADS according to trainer_count.
-       For OpenBLAS:
-         export OPENBLAS_NUM_THREADS, OPENBLAS_MAIN_FREE according to trainer_count. 
-    '''
-    import platform, paddle
-    if not platform.system() in ['Linux', 'Darwin']:
-        return
-
-    def set_env(key, value):
-        '''If the key has not been set in the environment, set it with value.'''
-        assert isinstance(key, str)
-        assert isinstance(value, str)
-        envset = os.environ.get(key)
-        if envset is None:
-            os.environ[key] = value
-
-    def num_physical_cores():
-        '''Get the number of physical cores'''
-        if platform.system() == "Linux":
-            num_sockets = int(
-                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
-                .read())
-            num_cores_per_socket = int(
-                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
-                .read())
-            return num_sockets * num_cores_per_socket
-        else:
-            cmds = {"Darwin": "sysctl -n hw.physicalcpu"}
-            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
-
-    def num_logical_processors():
-        '''Get the number of logical processors'''
-        cmds = {
-            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
-            "Darwin": "sysctl -n hw.logicalcpu"
-        }
-        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
-
-    num_cores = num_physical_cores()
-    num_processors = num_logical_processors()
-    if paddle.version.mkl() == 'ON':
-        if num_processors > num_cores:  # Hyper Threading is enabled
-            set_env("OMP_DYNAMIC", "true")
-            set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
-        else:
-            set_env("OMP_DYNAMIC", "false")
-            set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
-    threads = num_processors / trainer_count
-    threads = '1' if threads < 1 else str(threads)
-    if paddle.version.mkl() == 'ON':
-        set_env("OMP_NUM_THREADS", threads)
-        set_env("MKL_NUM_THREADS", threads)
-    else:
-        set_env("OPENBLAS_NUM_THREADS", threads)
-        if threads > 1:
-            set_env("OPENBLAS_MAIN_FREE", '1')
-
-
-def init(**kwargs):
-    import py_paddle.swig_paddle as api
-    args = []
-    args_dict = {}
-    # NOTE: append arguments if they are in ENV
-    for ek, ev in os.environ.iteritems():
-        if ek.startswith("PADDLE_INIT_"):
-            args_dict[ek.replace("PADDLE_INIT_", "").lower()] = str(ev)
-
-    args_dict.update(kwargs)
-    # NOTE: overwrite arguments from ENV if it is in kwargs
-    for key in args_dict.keys():
-        args.append('--%s=%s' % (key, str(args_dict[key])))
-
-    set_env_vars(kwargs.get('trainer_count', 1))
-
-    if 'use_gpu' in kwargs:
-        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
-    if 'use_mkldnn' in kwargs:
-        cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
-    if 'use_mkl_packed' in kwargs:
-        cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed']
-    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
-                                         "supported in v2 APIs.")
-
-    api.initPaddle(*args)
-
-
-infer = inference.infer
-batch = minibatch.batch
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
deleted file mode 100644
index 21261a178..000000000
--- a/python/paddle/v2/activation.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.activations
-import copy
-
-__all__ = []
-
-suffix = 'Activation'
-for act in paddle.trainer_config_helpers.activations.__all__:
-    new_name = act[:-len(suffix)]
-    globals()[new_name] = copy.copy(
-        getattr(paddle.trainer_config_helpers.activations, act))
-    globals()[new_name].__name__ = new_name
-    __all__.append(new_name)
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
deleted file mode 100644
index 5d23894d7..000000000
--- a/python/paddle/v2/attr.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.attrs
-
-__all__ = [
-    "Param",
-    "Extra",
-    "Hook",
-]
-
-Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
-Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
-Hook = paddle.trainer_config_helpers.attrs.HookAttribute
-
-for each in paddle.trainer_config_helpers.attrs.__all__:
-    globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
-    __all__.append(each)
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
deleted file mode 100644
index d9613e001..000000000
--- a/python/paddle/v2/config_base.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import re
-import paddle.trainer_config_helpers as conf_helps
-
-__layer_map__ = {}
-
-
-def __map_docstr__(doc, name):
-    if doc is None:
-        return doc
-
-    assert isinstance(doc, basestring)
-
-    # replace LayerOutput to paddle.v2.config_base.Layer
-    doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
-
-    doc = doc.replace('ParameterAttribute', 'paddle.v2.attr.ParameterAttribute')
-
-    doc = re.sub(r'ExtraLayerAttribute[^\s]?', 'paddle.v2.attr.ExtraAttribute',
-                 doc)
-
-    # xxx_layer to xxx
-    doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
-
-    # XxxxActivation to paddle.v2.activation.Xxxx
-    doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
-                 r"paddle.v2.activation.\g<name>", doc)
-
-    # xxx_evaluator to paddle.v2.evaluator.xxx
-    doc = re.sub(r"(?P<name>[a-z]+)_evaluator", r"evaluator.\g<name>", doc)
-
-    # TODO(yuyang18): Add more rules if needed.
-    return doc
-
-
-def __convert_to_v2__(f, name, module):
-    def wrapped(*args, **xargs):
-        out = f(*args, **xargs)
-        outs = out
-        if not isinstance(out, collections.Sequence):
-            outs = [out]
-        for l in outs:
-            if isinstance(l, conf_helps.LayerOutput):
-                __layer_map__[l.full_name] = l
-        return out
-
-    wrapped.__doc__ = __map_docstr__(f.__doc__, name)
-    wrapped.__name__ = name
-    wrapped.__module__ = module
-
-    return wrapped
-
-
-Layer = conf_helps.LayerOutput
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
deleted file mode 100644
index 98dfb85a0..000000000
--- a/python/paddle/v2/data_feeder.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from py_paddle import DataProviderConverter
-import collections
-import paddle.trainer.PyDataProvider2 as pydp2
-
-__all__ = ['DataFeeder']
-
-
-def default_feeding_map(data_types):
-    reader_dict = dict()
-    for i, tp in enumerate(data_types):
-        reader_dict[tp[0]] = i
-    return reader_dict
-
-
-class DataFeeder(DataProviderConverter):
-    """
-    DataFeeder converts the data returned by paddle.reader into a data structure
-    of Arguments which is defined in the API. The paddle.reader usually returns
-    a list of mini-batch data entries. Each data entry in the list is one sample.
-    Each sample is a list or a tuple with one feature or multiple features.
-    DataFeeder converts this mini-batch data entries into Arguments in order
-    to feed it to C++ interface.
-    
-    The simple usage shows below
-
-    ..  code-block:: python
-
-        feeding = ['image', 'label']
-        data_types = enumerate_data_types_of_data_layers(topology)
-        feeder = DataFeeder(data_types=data_types, feeding=feeding)
-
-        minibatch_data = [([1.0, 2.0, 3.0, ...], 5)]
-
-        arg = feeder(minibatch_data)
-
-
-    If mini-batch data and data layers are not one to one mapping, we
-    could pass a dictionary to feeding parameter to represent the mapping
-    relationship.
-
-
-    ..  code-block:: python
-
-        data_types = [('image', paddle.data_type.dense_vector(784)),
-                      ('label', paddle.data_type.integer_value(10))]
-        feeding = {'image':0, 'label':1}
-        feeder = DataFeeder(data_types=data_types, feeding=feeding)
-        minibatch_data = [
-                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
-                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
-                         ]
-        # or minibatch_data = [
-        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
-        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
-        #                     ]
-        arg = feeder.convert(minibatch_data)
-
-    ..  note::
-
-        This module is for internal use only. Users should use the `reader`
-        interface.
-
-
-
-    :param data_types: A list to specify data name and type. Each item is
-                       a tuple of (data_name, data_type).
-
-    :type data_types: list
-    :param feeding: A dictionary or a sequence to specify the position of each
-                    data in the input data.
-    :type feeding: dict|collections.Sequence|None
-    """
-
-    def __init__(self, data_types, feeding=None):
-        self.input_names = []
-        input_types = []
-        if feeding is None:
-            feeding = default_feeding_map(data_types)
-        elif isinstance(feeding, collections.Sequence):
-            feed_list = feeding
-            feeding = dict()
-            for i, name in enumerate(feed_list):
-                feeding[name] = i
-        elif not isinstance(feeding, dict):
-            raise TypeError("Feeding should be dict or sequence or None.")
-
-        self.feeding = feeding
-        for each in data_types:
-            self.input_names.append(each[0])
-            if not isinstance(each[1], pydp2.InputType):
-                raise TypeError("second item in each data_type should be an "
-                                "InputType")
-            input_types.append(each[1])
-        DataProviderConverter.__init__(self, input_types)
-
-    def __len__(self):
-        return len(self.input_names)
-
-    def convert(self, dat, argument=None):
-        """
-        :param dat: A list of mini-batch data. Each sample is a list or tuple
-                    one feature or multiple features.
-
-        :type dat: list
-        :param argument: An Arguments object contains this mini-batch data with
-                         one or multiple features. The Arguments definition is
-                         in the API.
-        :type argument: py_paddle.swig_paddle.Arguments
-        """
-
-        def reorder_data(data):
-            retv = []
-            for each in data:
-                reorder = []
-                for name in self.input_names:
-                    reorder.append(each[self.feeding[name]])
-                retv.append(reorder)
-            return retv
-
-        return DataProviderConverter.convert(self, reorder_data(dat), argument)
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
deleted file mode 100644
index 226997465..000000000
--- a/python/paddle/v2/data_type.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer.PyDataProvider2 as pydp2
-
-import_list = [
-    nm for nm in dir(pydp2)
-    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm or
-                                       'array' in nm)
-]
-import_list.extend(['InputType'])
-
-for nm in import_list:
-    globals()[nm] = getattr(pydp2, nm)
-
-__all__ = import_list
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
deleted file mode 100644
index 38056fe0a..000000000
--- a/python/paddle/v2/dataset/__init__.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Dataset package.
-"""
-
-import mnist
-import imikolov
-import imdb
-import cifar
-import movielens
-import conll05
-import uci_housing
-import sentiment
-import wmt14
-import wmt16
-import mq2007
-import flowers
-import voc2012
-
-__all__ = [
-    'mnist',
-    'imikolov',
-    'imdb',
-    'cifar',
-    'movielens',
-    'conll05',
-    'sentiment',
-    'uci_housing',
-    'wmt14',
-    'wmt16',
-    'mq2007',
-    'flowers',
-    'voc2012',
-]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
deleted file mode 100644
index 662655c83..000000000
--- a/python/paddle/v2/dataset/cifar.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-CIFAR dataset.
-
-This module will download dataset from
-https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
-paddle reader creators.
-
-The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
-with 6000 images per class. There are 50000 training images and 10000 test
-images.
-
-The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
-containing 600 images each. There are 500 training images and 100 testing
-images per class.
-
-"""
-
-import cPickle
-import itertools
-import numpy
-import paddle.v2.dataset.common
-import tarfile
-
-__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
-
-URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
-CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
-CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
-CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
-CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
-
-
-def reader_creator(filename, sub_name, cycle=False):
-    def read_batch(batch):
-        data = batch['data']
-        labels = batch.get('labels', batch.get('fine_labels', None))
-        assert labels is not None
-        for sample, label in itertools.izip(data, labels):
-            yield (sample / 255.0).astype(numpy.float32), int(label)
-
-    def reader():
-        with tarfile.open(filename, mode='r') as f:
-            names = (each_item.name for each_item in f
-                     if sub_name in each_item.name)
-
-            while True:
-                for name in names:
-                    batch = cPickle.load(f.extractfile(name))
-                    for item in read_batch(batch):
-                        yield item
-                if not cycle:
-                    break
-
-    return reader
-
-
-def train100():
-    """
-    CIFAR-100 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 99].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'train')
-
-
-def test100():
-    """
-    CIFAR-100 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
-
-
-def train10(cycle=False):
-    """
-    CIFAR-10 training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch',
-        cycle=cycle)
-
-
-def test10(cycle=False):
-    """
-    CIFAR-10 test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch',
-        cycle=cycle)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
-    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
-    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
-    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
deleted file mode 100644
index c6ff09a1d..000000000
--- a/python/paddle/v2/dataset/common.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import requests
-import hashlib
-import os
-import errno
-import shutil
-import sys
-import importlib
-import paddle.v2.dataset
-import cPickle
-import glob
-import cPickle as pickle
-
-__all__ = [
-    'DATA_HOME',
-    'download',
-    'md5file',
-    'split',
-    'cluster_files_reader',
-    'convert',
-]
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
-
-
-# When running unit tests, there could be multiple processes that
-# trying to create DATA_HOME directory simultaneously, so we cannot
-# use a if condition to check for the existence of the directory;
-# instead, we use the filesystem as the synchronization mechanism by
-# catching returned errors.
-def must_mkdirs(path):
-    try:
-        os.makedirs(DATA_HOME)
-    except OSError as exc:
-        if exc.errno != errno.EEXIST:
-            raise
-        pass
-
-
-must_mkdirs(DATA_HOME)
-
-
-def md5file(fname):
-    hash_md5 = hashlib.md5()
-    f = open(fname, "rb")
-    for chunk in iter(lambda: f.read(4096), b""):
-        hash_md5.update(chunk)
-    f.close()
-    return hash_md5.hexdigest()
-
-
-def download(url, module_name, md5sum, save_name=None):
-    dirname = os.path.join(DATA_HOME, module_name)
-    if not os.path.exists(dirname):
-        os.makedirs(dirname)
-
-    filename = os.path.join(dirname,
-                            url.split('/')[-1]
-                            if save_name is None else save_name)
-
-    retry = 0
-    retry_limit = 3
-    while not (os.path.exists(filename) and md5file(filename) == md5sum):
-        if os.path.exists(filename):
-            print "file md5", md5file(filename), md5sum
-        if retry < retry_limit:
-            retry += 1
-        else:
-            raise RuntimeError("Cannot download {0} within retry limit {1}".
-                               format(url, retry_limit))
-        print "Cache file %s not found, downloading %s" % (filename, url)
-        r = requests.get(url, stream=True)
-        total_length = r.headers.get('content-length')
-
-        if total_length is None:
-            with open(filename, 'w') as f:
-                shutil.copyfileobj(r.raw, f)
-        else:
-            with open(filename, 'w') as f:
-                dl = 0
-                total_length = int(total_length)
-                for data in r.iter_content(chunk_size=4096):
-                    dl += len(data)
-                    f.write(data)
-                    done = int(50 * dl / total_length)
-                    sys.stdout.write("\r[%s%s]" % ('=' * done,
-                                                   ' ' * (50 - done)))
-                    sys.stdout.flush()
-
-    return filename
-
-
-def fetch_all():
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
-        if "fetch" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)):
-            getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
-                "fetch")()
-
-
-def fetch_all_recordio(path):
-    for module_name in filter(lambda x: not x.startswith("__"),
-                              dir(paddle.v2.dataset)):
-        if "convert" in dir(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
-                not module_name == "common":
-            ds_path = os.path.join(path, module_name)
-            must_mkdirs(ds_path)
-            getattr(
-                importlib.import_module("paddle.v2.dataset.%s" % module_name),
-                "convert")(ds_path)
-
-
-def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
-    """
-    you can call the function as:
-
-    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
-        suffix="imikolov-train-%05d.pickle")
-
-    the output files as:
-
-    |-imikolov-train-00000.pickle
-    |-imikolov-train-00001.pickle
-    |- ...
-    |-imikolov-train-00480.pickle
-
-    :param reader: is a reader creator
-    :param line_count: line count for each file
-    :param suffix: the suffix for the output files, should contain "%d"
-                means the id for each file. Default is "%05d.pickle"
-    :param dumper: is a callable function that dump object to file, this
-                function will be called as dumper(obj, f) and obj is the object
-                will be dumped, f is a file object. Default is cPickle.dump.
-    """
-    if not callable(dumper):
-        raise TypeError("dumper should be callable.")
-    lines = []
-    indx_f = 0
-    for i, d in enumerate(reader()):
-        lines.append(d)
-        if i >= line_count and i % line_count == 0:
-            with open(suffix % indx_f, "w") as f:
-                dumper(lines, f)
-                lines = []
-                indx_f += 1
-    if lines:
-        with open(suffix % indx_f, "w") as f:
-            dumper(lines, f)
-
-
-def cluster_files_reader(files_pattern,
-                         trainer_count,
-                         trainer_id,
-                         loader=cPickle.load):
-    """
-    Create a reader that yield element from the given files, select
-    a file set according trainer count and trainer_id
-
-    :param files_pattern: the files which generating by split(...)
-    :param trainer_count: total trainer count
-    :param trainer_id: the trainer rank id
-    :param loader: is a callable function that load object from file, this
-                function will be called as loader(f) and f is a file object.
-                Default is cPickle.load
-    """
-
-    def reader():
-        if not callable(loader):
-            raise TypeError("loader should be callable.")
-        file_list = glob.glob(files_pattern)
-        file_list.sort()
-        my_file_list = []
-        for idx, fn in enumerate(file_list):
-            if idx % trainer_count == trainer_id:
-                print "append file: %s" % fn
-                my_file_list.append(fn)
-        for fn in my_file_list:
-            with open(fn, "r") as f:
-                lines = loader(f)
-                for line in lines:
-                    yield line
-
-    return reader
-
-
-def convert(output_path, reader, line_count, name_prefix):
-    import recordio
-    """
-    Convert data from reader to recordio format files.
-
-    :param output_path: directory in which output files will be saved.
-    :param reader: a data reader, from which the convert program will read
-                   data instances.
-    :param name_prefix: the name prefix of generated files.
-    :param max_lines_to_shuffle: the max lines numbers to shuffle before
-                                 writing.
-    """
-
-    assert line_count >= 1
-    indx_f = 0
-
-    def write_data(indx_f, lines):
-        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
-        writer = recordio.writer(filename)
-        for l in lines:
-            # FIXME(Yancey1989):
-            # dumps with protocol: pickle.HIGHEST_PROTOCOL
-            writer.write(cPickle.dumps(l))
-        writer.close()
-
-    lines = []
-    for i, d in enumerate(reader()):
-        lines.append(d)
-        if i % line_count == 0 and i >= line_count:
-            write_data(indx_f, lines)
-            lines = []
-            indx_f += 1
-            continue
-
-    write_data(indx_f, lines)
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
deleted file mode 100644
index 8312900dc..000000000
--- a/python/paddle/v2/dataset/conll05.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Conll05 dataset.
-Paddle semantic role labeling Book and demo use this dataset as an example.
-Because Conll05 is not free in public, the default downloaded URL is test set
-of Conll05 (which is public). Users can change URL and MD5 to their Conll
-dataset. And a pre-trained word vector model based on Wikipedia corpus is used
-to initialize SRL model.
-"""
-
-import tarfile
-import gzip
-import itertools
-import paddle.v2.dataset.common
-
-__all__ = ['test, get_dict', 'get_embedding', 'convert']
-
-DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
-DATA_MD5 = '387719152ae52d60422c016e92a742fc'
-WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
-WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
-VERBDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FverbDict.txt'
-VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
-TRGDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FtargetDict.txt'
-TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
-EMB_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2Femb'
-EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
-
-UNK_IDX = 0
-
-
-def load_label_dict(filename):
-    d = dict()
-    tag_dict = set()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            line = line.strip()
-            if line.startswith("B-"):
-                tag_dict.add(line[2:])
-            elif line.startswith("I-"):
-                tag_dict.add(line[2:])
-        index = 0
-        for tag in tag_dict:
-            d["B-" + tag] = index
-            index += 1
-            d["I-" + tag] = index
-            index += 1
-        d["O"] = index
-    return d
-
-
-def load_dict(filename):
-    d = dict()
-    with open(filename, 'r') as f:
-        for i, line in enumerate(f):
-            d[line.strip()] = i
-    return d
-
-
-def corpus_reader(data_path, words_name, props_name):
-    """
-    Read one corpus. It returns an iterator. Each element of
-    this iterator is a tuple including sentence and labels. The sentence is
-    consist of a list of word IDs. The labels include a list of label IDs.
-    :return: a iterator of data.
-    :rtype: iterator
-    """
-
-    def reader():
-        tf = tarfile.open(data_path)
-        wf = tf.extractfile(words_name)
-        pf = tf.extractfile(props_name)
-        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
-                fileobj=pf) as props_file:
-            sentences = []
-            labels = []
-            one_seg = []
-            for word, label in itertools.izip(words_file, props_file):
-                word = word.strip()
-                label = label.strip().split()
-
-                if len(label) == 0:  # end of sentence
-                    for i in xrange(len(one_seg[0])):
-                        a_kind_lable = [x[i] for x in one_seg]
-                        labels.append(a_kind_lable)
-
-                    if len(labels) >= 1:
-                        verb_list = []
-                        for x in labels[0]:
-                            if x != '-':
-                                verb_list.append(x)
-
-                        for i, lbl in enumerate(labels[1:]):
-                            cur_tag = 'O'
-                            is_in_bracket = False
-                            lbl_seq = []
-                            verb_word = ''
-                            for l in lbl:
-                                if l == '*' and is_in_bracket == False:
-                                    lbl_seq.append('O')
-                                elif l == '*' and is_in_bracket == True:
-                                    lbl_seq.append('I-' + cur_tag)
-                                elif l == '*)':
-                                    lbl_seq.append('I-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') != -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = False
-                                elif l.find('(') != -1 and l.find(')') == -1:
-                                    cur_tag = l[1:l.find('*')]
-                                    lbl_seq.append('B-' + cur_tag)
-                                    is_in_bracket = True
-                                else:
-                                    raise RuntimeError('Unexpected label: %s' %
-                                                       l)
-
-                            yield sentences, verb_list[i], lbl_seq
-
-                    sentences = []
-                    labels = []
-                    one_seg = []
-                else:
-                    sentences.append(word)
-                    one_seg.append(label)
-
-        pf.close()
-        wf.close()
-        tf.close()
-
-    return reader
-
-
-def reader_creator(corpus_reader,
-                   word_dict=None,
-                   predicate_dict=None,
-                   label_dict=None):
-    def reader():
-        for sentence, predicate, labels in corpus_reader():
-
-            sen_len = len(sentence)
-
-            verb_index = labels.index('B-V')
-            mark = [0] * len(labels)
-            if verb_index > 0:
-                mark[verb_index - 1] = 1
-                ctx_n1 = sentence[verb_index - 1]
-            else:
-                ctx_n1 = 'bos'
-
-            if verb_index > 1:
-                mark[verb_index - 2] = 1
-                ctx_n2 = sentence[verb_index - 2]
-            else:
-                ctx_n2 = 'bos'
-
-            mark[verb_index] = 1
-            ctx_0 = sentence[verb_index]
-
-            if verb_index < len(labels) - 1:
-                mark[verb_index + 1] = 1
-                ctx_p1 = sentence[verb_index + 1]
-            else:
-                ctx_p1 = 'eos'
-
-            if verb_index < len(labels) - 2:
-                mark[verb_index + 2] = 1
-                ctx_p2 = sentence[verb_index + 2]
-            else:
-                ctx_p2 = 'eos'
-
-            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
-
-            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            pred_idx = [predicate_dict.get(predicate)] * sen_len
-            label_idx = [label_dict.get(w) for w in labels]
-
-            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
-              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
-
-    return reader
-
-
-def get_dict():
-    """
-    Get the word, verb and label dictionary of Wikipedia corpus.
-    """
-    word_dict = load_dict(
-        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
-                                          WORDDICT_MD5))
-    verb_dict = load_dict(
-        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
-                                          VERBDICT_MD5))
-    label_dict = load_label_dict(
-        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
-                                          TRGDICT_MD5))
-    return word_dict, verb_dict, label_dict
-
-
-def get_embedding():
-    """
-    Get the trained word vector based on Wikipedia corpus.
-    """
-    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-
-
-def test():
-    """
-    Conll05 test set creator.
-
-    Because the training dataset is not free, the test dataset is used for
-    training. It returns a reader creator, each sample in the reader is nine
-    features, including sentence sequence, predicate, predicate context,
-    predicate context flag and tagged sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    word_dict, verb_dict, label_dict = get_dict()
-    reader = corpus_reader(
-        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
-        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
-        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
-    return reader_creator(reader, word_dict, verb_dict, label_dict)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
-    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
deleted file mode 100644
index db12076d5..000000000
--- a/python/paddle/v2/dataset/flowers.py
+++ /dev/null
@@ -1,218 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module will download dataset from
-http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
-and parse train/test set intopaddle reader creators.
-
-This set contains images of flowers belonging to 102 different categories.
-The images were acquired by searching the web and taking pictures. There are a
-minimum of 40 images for each category.
-
-The database was used in:
-
-Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
- number of classes.Proceedings of the Indian Conference on Computer Vision,
-Graphics and Image Processing (2008)
-http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
-
-"""
-import cPickle
-import itertools
-import functools
-from common import download
-import tarfile
-import scipy.io as scio
-from paddle.v2.image import *
-from paddle.v2.reader import *
-import os
-import numpy as np
-from multiprocessing import cpu_count
-__all__ = ['train', 'test', 'valid']
-
-DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
-LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
-SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
-LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
-SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
-# In official 'readme', tstid is the flag of test data
-# and trnid is the flag of train data. But test data is more than train data.
-# So we exchange the train data and test data.
-TRAIN_FLAG = 'tstid'
-TEST_FLAG = 'trnid'
-VALID_FLAG = 'valid'
-
-
-def default_mapper(is_train, sample):
-    '''
-    map image bytes data to type needed by model input layer
-    '''
-    img, label = sample
-    img = load_image_bytes(img)
-    img = simple_transform(
-        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
-    return img.flatten().astype('float32'), label
-
-
-train_mapper = functools.partial(default_mapper, True)
-test_mapper = functools.partial(default_mapper, False)
-
-
-def reader_creator(data_file,
-                   label_file,
-                   setid_file,
-                   dataset_name,
-                   mapper,
-                   buffered_size=1024,
-                   use_xmap=True,
-                   cycle=False):
-    '''
-    1. read images from tar file and
-        merge images into batch files in 102flowers.tgz_batch/
-    2. get a reader to read sample from batch file
-
-    :param data_file: downloaded data file
-    :type data_file: string
-    :param label_file: downloaded label file
-    :type label_file: string
-    :param setid_file: downloaded setid file containing information
-                        about how to split dataset
-    :type setid_file: string
-    :param dataset_name: data set name (tstid|trnid|valid)
-    :type dataset_name: string
-    :param mapper: a function to map image bytes data to type
-                    needed by model input layer
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: data reader
-    :rtype: callable
-    '''
-    labels = scio.loadmat(label_file)['labels'][0]
-    indexes = scio.loadmat(setid_file)[dataset_name][0]
-    img2label = {}
-    for i in indexes:
-        img = "jpg/image_%05d.jpg" % i
-        img2label[img] = labels[i - 1]
-    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
-
-    def reader():
-        while True:
-            for file in open(file_list):
-                file = file.strip()
-                batch = None
-                with open(file, 'r') as f:
-                    batch = cPickle.load(f)
-                data = batch['data']
-                labels = batch['label']
-                for sample, label in itertools.izip(data, batch['label']):
-                    yield sample, int(label) - 1
-            if not cycle:
-                break
-
-    if use_xmap:
-        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
-        return xmap_readers(mapper, reader, cpu_num, buffered_size)
-    else:
-        return map_readers(mapper, reader)
-
-
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers training set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: train data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TRAIN_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
-    '''
-    Create flowers test set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :param cycle: whether to cycle through the dataset
-    :type cycle: bool
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5),
-        TEST_FLAG,
-        mapper,
-        buffered_size,
-        use_xmap,
-        cycle=cycle)
-
-
-def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
-    '''
-    Create flowers validation set reader.
-    It returns a reader, each sample in the reader is
-    image pixels in [0, 1] and label in [1, 102]
-    translated from original color image by steps:
-    1. resize to 256*256
-    2. random crop to 224*224
-    3. flatten
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param buffered_size: the size of buffer used to process images
-    :type buffered_size: int
-    :return: test data reader
-    :rtype: callable
-    '''
-    return reader_creator(
-        download(DATA_URL, 'flowers', DATA_MD5),
-        download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
-        buffered_size, use_xmap)
-
-
-def fetch():
-    download(DATA_URL, 'flowers', DATA_MD5)
-    download(LABEL_URL, 'flowers', LABEL_MD5)
-    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
deleted file mode 100644
index 00c2a3b99..000000000
--- a/python/paddle/v2/dataset/imdb.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-IMDB dataset.
-
-This module downloads IMDB dataset from
-http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
-of 25,000 highly polar movie reviews for training, and 25,000 for testing.
-Besides, this module also provides API for building dictionary.
-"""
-
-import paddle.v2.dataset.common
-import collections
-import tarfile
-import re
-import string
-
-__all__ = ['build_dict', 'train', 'test', 'convert']
-
-URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
-MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
-
-
-def tokenize(pattern):
-    """
-    Read files that match the given pattern.  Tokenize and yield each file.
-    """
-
-    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
-                                                        MD5)) as tarf:
-        # Note that we should use tarfile.next(), which does
-        # sequential access of member files, other than
-        # tarfile.extractfile, which does random access and might
-        # destroy hard disks.
-        tf = tarf.next()
-        while tf != None:
-            if bool(pattern.match(tf.name)):
-                # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
-                    None, string.punctuation).lower().split()
-            tf = tarf.next()
-
-
-def build_dict(pattern, cutoff):
-    """
-    Build a word dictionary from the corpus. Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    word_freq = collections.defaultdict(int)
-    for doc in tokenize(pattern):
-        for word in doc:
-            word_freq[word] += 1
-
-    # Not sure if we should prune less-frequent words here.
-    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
-
-    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*dictionary))
-    word_idx = dict(zip(words, xrange(len(words))))
-    word_idx['<unk>'] = len(words)
-    return word_idx
-
-
-def reader_creator(pos_pattern, neg_pattern, word_idx):
-    UNK = word_idx['<unk>']
-    INS = []
-
-    def load(pattern, out, label):
-        for doc in tokenize(pattern):
-            out.append(([word_idx.get(w, UNK) for w in doc], label))
-
-    load(pos_pattern, INS, 0)
-    load(neg_pattern, INS, 1)
-
-    def reader():
-        for doc, label in INS:
-            yield doc, label
-
-    return reader
-
-
-def train(word_idx):
-    """
-    IMDB training set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/train/pos/.*\.txt$"),
-        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
-
-
-def test(word_idx):
-    """
-    IMDB test set creator.
-
-    It returns a reader creator, each sample in the reader is an zero-based ID
-    sequence and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        re.compile("aclImdb/test/pos/.*\.txt$"),
-        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
-
-
-def word_dict(cutoff=150):
-    """
-    Build a word dictionary from the corpus.
-
-    :return: Word dictionary
-    :rtype: dict
-    """
-    return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    w = word_dict()
-    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
-    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
deleted file mode 100644
index 617c722c4..000000000
--- a/python/paddle/v2/dataset/imikolov.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-imikolov's simple dataset.
-
-This module will download dataset from 
-http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
-into paddle reader creators.
-"""
-import paddle.v2.dataset.common
-import collections
-import tarfile
-
-__all__ = ['train', 'test', 'build_dict', 'convert']
-
-URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
-MD5 = '30177ea32e27c525793142b6bf2c8e2d'
-
-
-class DataType(object):
-    NGRAM = 1
-    SEQ = 2
-
-
-def word_count(f, word_freq=None):
-    if word_freq is None:
-        word_freq = collections.defaultdict(int)
-
-    for l in f:
-        for w in l.strip().split():
-            word_freq[w] += 1
-        word_freq['<s>'] += 1
-        word_freq['<e>'] += 1
-
-    return word_freq
-
-
-def build_dict(min_word_freq=50):
-    """
-    Build a word dictionary from the corpus,  Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    train_filename = './simple-examples/data/ptb.train.txt'
-    test_filename = './simple-examples/data/ptb.valid.txt'
-    with tarfile.open(
-            paddle.v2.dataset.common.download(
-                paddle.v2.dataset.imikolov.URL, 'imikolov',
-                paddle.v2.dataset.imikolov.MD5)) as tf:
-        trainf = tf.extractfile(train_filename)
-        testf = tf.extractfile(test_filename)
-        word_freq = word_count(testf, word_count(trainf))
-        if '<unk>' in word_freq:
-            # remove <unk> for now, since we will set it as last index
-            del word_freq['<unk>']
-
-        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
-
-        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-        words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(zip(words, xrange(len(words))))
-        word_idx['<unk>'] = len(words)
-
-    return word_idx
-
-
-def reader_creator(filename, word_idx, n, data_type):
-    def reader():
-        with tarfile.open(
-                paddle.v2.dataset.common.download(
-                    paddle.v2.dataset.imikolov.URL, 'imikolov',
-                    paddle.v2.dataset.imikolov.MD5)) as tf:
-            f = tf.extractfile(filename)
-
-            UNK = word_idx['<unk>']
-            for l in f:
-                if DataType.NGRAM == data_type:
-                    assert n > -1, 'Invalid gram length'
-                    l = ['<s>'] + l.strip().split() + ['<e>']
-                    if len(l) >= n:
-                        l = [word_idx.get(w, UNK) for w in l]
-                        for i in range(n, len(l) + 1):
-                            yield tuple(l[i - n:i])
-                elif DataType.SEQ == data_type:
-                    l = l.strip().split()
-                    l = [word_idx.get(w, UNK) for w in l]
-                    src_seq = [word_idx['<s>']] + l
-                    trg_seq = l + [word_idx['<e>']]
-                    if n > 0 and len(src_seq) > n: continue
-                    yield src_seq, trg_seq
-                else:
-                    assert False, 'Unknow data type'
-
-    return reader
-
-
-def train(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov training set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
-                          data_type)
-
-
-def test(word_idx, n, data_type=DataType.NGRAM):
-    """
-    imikolov test set creator.
-
-    It returns a reader creator, each sample in the reader is a word ID
-    tuple.
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :param n: sliding window size if type is ngram, otherwise max length of sequence
-    :type n: int
-    :param data_type: data type (ngram or sequence)
-    :type data_type: member variable of DataType (NGRAM or SEQ)
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
-                          data_type)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    N = 5
-    word_dict = build_dict()
-    paddle.v2.dataset.common.convert(path,
-                                     train(word_dict, N), 1000,
-                                     "imikolov_train")
-    paddle.v2.dataset.common.convert(path,
-                                     test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
deleted file mode 100644
index 026cf501c..000000000
--- a/python/paddle/v2/dataset/mnist.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MNIST dataset.
-
-This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
-parse training set and test set into paddle reader creators.
-"""
-import paddle.v2.dataset.common
-import subprocess
-import numpy
-import platform
-__all__ = ['train', 'test', 'convert']
-
-URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
-TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
-TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
-TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
-TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
-TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
-TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
-TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
-TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
-
-
-def reader_creator(image_filename, label_filename, buffer_size):
-    def reader():
-        if platform.system() == 'Darwin':
-            zcat_cmd = 'gzcat'
-        elif platform.system() == 'Linux':
-            zcat_cmd = 'zcat'
-        else:
-            raise NotImplementedError()
-
-        # According to http://stackoverflow.com/a/38061619/724872, we
-        # cannot use standard package gzip here.
-        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
-        m.stdout.read(16)  # skip some magic bytes
-
-        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
-        l.stdout.read(8)  # skip some magic bytes
-
-        try:  # reader could be break.
-            while True:
-                labels = numpy.fromfile(
-                    l.stdout, 'ubyte', count=buffer_size).astype("int")
-
-                if labels.size != buffer_size:
-                    break  # numpy.fromfile returns empty slice after EOF.
-
-                images = numpy.fromfile(
-                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
-                        (buffer_size, 28 * 28)).astype('float32')
-
-                images = images / 255.0 * 2.0 - 1.0
-
-                for i in xrange(buffer_size):
-                    yield images[i, :], int(labels[i])
-        finally:
-            try:
-                m.terminate()
-            except:
-                pass
-            try:
-                l.terminate()
-            except:
-                pass
-
-    return reader
-
-
-def train():
-    """
-    MNIST training set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
-                                          TRAIN_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
-                                          TRAIN_LABEL_MD5), 100)
-
-
-def test():
-    """
-    MNIST test set creator.
-
-    It returns a reader creator, each sample in the reader is image pixels in
-    [0, 1] and label in [0, 9].
-
-    :return: Test reader creator.
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
-                                          TEST_IMAGE_MD5),
-        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
-                                          TEST_LABEL_MD5), 100)
-
-
-def fetch():
-    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
-    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
deleted file mode 100644
index 5b61a9420..000000000
--- a/python/paddle/v2/dataset/movielens.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Movielens 1-M dataset.
-
-Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
-movies, which was collected by GroupLens Research. This module will download
-Movielens 1-M dataset from 
-http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
-set and test set into paddle reader creators.
-
-"""
-
-import zipfile
-import paddle.v2.dataset.common
-import re
-import random
-import functools
-
-__all__ = [
-    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
-    'convert'
-]
-
-age_table = [1, 18, 25, 35, 45, 50, 56]
-
-URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
-MD5 = 'c4d9eecfca2ab87c1945afe126590906'
-
-
-class MovieInfo(object):
-    """
-    Movie id, title and categories information are stored in MovieInfo.
-    """
-
-    def __init__(self, index, categories, title):
-        self.index = int(index)
-        self.categories = categories
-        self.title = title
-
-    def value(self):
-        """
-        Get information from a movie.
-        """
-        return [
-            self.index, [CATEGORIES_DICT[c] for c in self.categories],
-            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
-        ]
-
-    def __str__(self):
-        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
-            self.index, self.title, self.categories)
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class UserInfo(object):
-    """
-    User id, gender, age, and job information are stored in UserInfo.
-    """
-
-    def __init__(self, index, gender, age, job_id):
-        self.index = int(index)
-        self.is_male = gender == 'M'
-        self.age = age_table.index(int(age))
-        self.job_id = int(job_id)
-
-    def value(self):
-        """
-        Get information from a user.
-        """
-        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
-
-    def __str__(self):
-        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
-            self.index, "M"
-            if self.is_male else "F", age_table[self.age], self.job_id)
-
-    def __repr__(self):
-        return str(self)
-
-
-MOVIE_INFO = None
-MOVIE_TITLE_DICT = None
-CATEGORIES_DICT = None
-USER_INFO = None
-
-
-def __initialize_meta_info__():
-    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
-    global MOVIE_INFO
-    if MOVIE_INFO is None:
-        pattern = re.compile(r'^(.*)\((\d+)\)$')
-        with zipfile.ZipFile(file=fn) as package:
-            for info in package.infolist():
-                assert isinstance(info, zipfile.ZipInfo)
-                MOVIE_INFO = dict()
-                title_word_set = set()
-                categories_set = set()
-                with package.open('ml-1m/movies.dat') as movie_file:
-                    for i, line in enumerate(movie_file):
-                        movie_id, title, categories = line.strip().split('::')
-                        categories = categories.split('|')
-                        for c in categories:
-                            categories_set.add(c)
-                        title = pattern.match(title).group(1)
-                        MOVIE_INFO[int(movie_id)] = MovieInfo(
-                            index=movie_id, categories=categories, title=title)
-                        for w in title.split():
-                            title_word_set.add(w.lower())
-
-                global MOVIE_TITLE_DICT
-                MOVIE_TITLE_DICT = dict()
-                for i, w in enumerate(title_word_set):
-                    MOVIE_TITLE_DICT[w] = i
-
-                global CATEGORIES_DICT
-                CATEGORIES_DICT = dict()
-                for i, c in enumerate(categories_set):
-                    CATEGORIES_DICT[c] = i
-
-                global USER_INFO
-                USER_INFO = dict()
-                with package.open('ml-1m/users.dat') as user_file:
-                    for line in user_file:
-                        uid, gender, age, job, _ = line.strip().split("::")
-                        USER_INFO[int(uid)] = UserInfo(
-                            index=uid, gender=gender, age=age, job_id=job)
-    return fn
-
-
-def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
-    fn = __initialize_meta_info__()
-    rand = random.Random(x=rand_seed)
-    with zipfile.ZipFile(file=fn) as package:
-        with package.open('ml-1m/ratings.dat') as rating:
-            for line in rating:
-                if (rand.random() < test_ratio) == is_test:
-                    uid, mov_id, rating, _ = line.strip().split("::")
-                    uid = int(uid)
-                    mov_id = int(mov_id)
-                    rating = float(rating) * 2 - 5.0
-
-                    mov = MOVIE_INFO[mov_id]
-                    usr = USER_INFO[uid]
-                    yield usr.value() + mov.value() + [[rating]]
-
-
-def __reader_creator__(**kwargs):
-    return lambda: __reader__(**kwargs)
-
-
-train = functools.partial(__reader_creator__, is_test=False)
-test = functools.partial(__reader_creator__, is_test=True)
-
-
-def get_movie_title_dict():
-    """
-    Get movie title dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_TITLE_DICT
-
-
-def __max_index_info__(a, b):
-    if a.index > b.index:
-        return a
-    else:
-        return b
-
-
-def max_movie_id():
-    """
-    Get the maximum value of movie id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
-
-
-def max_user_id():
-    """
-    Get the maximum value of user id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
-
-
-def __max_job_id_impl__(a, b):
-    if a.job_id > b.job_id:
-        return a
-    else:
-        return b
-
-
-def max_job_id():
-    """
-    Get the maximum value of job id.
-    """
-    __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
-
-
-def movie_categories():
-    """
-    Get movie categoriges dictionary.
-    """
-    __initialize_meta_info__()
-    return CATEGORIES_DICT
-
-
-def user_info():
-    """
-    Get user info dictionary.
-    """
-    __initialize_meta_info__()
-    return USER_INFO
-
-
-def movie_info():
-    """
-    Get movie info dictionary.
-    """
-    __initialize_meta_info__()
-    return MOVIE_INFO
-
-
-def unittest():
-    for train_count, _ in enumerate(train()()):
-        pass
-    for test_count, _ in enumerate(test()()):
-        pass
-
-    print train_count, test_count
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, "movielens", MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
-
-
-if __name__ == '__main__':
-    unittest()
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
deleted file mode 100644
index d3b3dd524..000000000
--- a/python/paddle/v2/dataset/mq2007.py
+++ /dev/null
@@ -1,333 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-MQ2007 dataset
-
-MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
-validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
-validation set and testing set.
-
-MQ2007 dataset from website
-http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
-
-"""
-
-import os
-import functools
-import rarfile
-from common import download
-import numpy as np
-
-# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
-URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
-MD5 = "7be1640ae95c6408dab0ae7207bdc706"
-
-
-def __initialize_meta_info__():
-    """
-  download and extract the MQ2007 dataset
-  """
-    fn = fetch()
-    rar = rarfile.RarFile(fn)
-    dirpath = os.path.dirname(fn)
-    rar.extractall(path=dirpath)
-    return dirpath
-
-
-class Query(object):
-    """
-  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
-
-  Parameters:
-  ----------
-  query_id : int
-    query_id in dataset, mapping from query to relevance documents
-  relevance_score : int 
-    relevance score of query and document pair
-  feature_vector : array, dense feature
-    feature in vector format
-  description : string
-    comment section in query doc pair data
-  """
-
-    def __init__(self,
-                 query_id=-1,
-                 relevance_score=-1,
-                 feature_vector=None,
-                 description=""):
-        self.query_id = query_id
-        self.relevance_score = relevance_score
-        if feature_vector is None:
-            self.feature_vector = []
-        else:
-            self.feature_vector = feature_vector
-        self.description = description
-
-    def __str__(self):
-        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
-                               " ".join(str(f) for f in self.feature_vector))
-        return string
-
-    # @classmethod
-    def _parse_(self, text):
-        """
-    parse line into Query
-    """
-        comment_position = text.find('#')
-        line = text[:comment_position].strip()
-        self.description = text[comment_position + 1:].strip()
-        parts = line.split()
-        if len(parts) != 48:
-            sys.stdout.write("expect 48 space split parts, get %d" %
-                             (len(parts)))
-            return None
-        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
-        self.relevance_score = int(parts[0])
-        self.query_id = int(parts[1].split(':')[1])
-        for p in parts[2:]:
-            pair = p.split(':')
-            self.feature_vector.append(float(pair[1]))
-        return self
-
-
-class QueryList(object):
-    """
-  group query into list, every item in list is a Query
-  """
-
-    def __init__(self, querylist=None):
-        self.query_id = -1
-        if querylist is None:
-            self.querylist = []
-        else:
-            self.querylist = querylist
-            for query in self.querylist:
-                if self.query_id == -1:
-                    self.query_id = query.query_id
-                else:
-                    if self.query_id != query.query_id:
-                        raise ValueError("query in list must be same query_id")
-
-    def __iter__(self):
-        for query in self.querylist:
-            yield query
-
-    def __len__(self):
-        return len(self.querylist)
-
-    def __getitem__(self, i):
-        return self.querylist[i]
-
-    def _correct_ranking_(self):
-        if self.querylist is None:
-            return
-        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
-
-    def _add_query(self, query):
-        if self.query_id == -1:
-            self.query_id = query.query_id
-        else:
-            if self.query_id != query.query_id:
-                raise ValueError("query in list must be same query_id")
-        self.querylist.append(query)
-
-
-def gen_plain_txt(querylist):
-    """
-  gen plain text in list for other usage
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  query_id : np.array, shape=(samples_num, )
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-    """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield querylist.query_id, query.relevance_score, np.array(
-            query.feature_vector)
-
-
-def gen_point(querylist):
-    """
-  gen item in list for point-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    for query in querylist:
-        yield query.relevance_score, np.array(query.feature_vector)
-
-
-def gen_pair(querylist, partial_order="full"):
-    """
-  gen pair for pair-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-  pairtial_order : "full" or "neighbour"
-    there is redudant in all possiable pair combinations, which can be simplifed
-  gen pairs for neighbour items or the full partial order pairs
-
-  return :
-  ------
-  label : np.array, shape=(1)
-  query_left : np.array, shape=(1, feature_dimension)
-  query_right : same as left
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    labels = []
-    docpairs = []
-
-    # C(n,2)
-    for i in range(len(querylist)):
-        query_left = querylist[i]
-        for j in range(i + 1, len(querylist)):
-            query_right = querylist[j]
-            if query_left.relevance_score > query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_left.feature_vector),
-                    np.array(query_right.feature_vector)
-                ])
-            elif query_left.relevance_score < query_right.relevance_score:
-                labels.append([1])
-                docpairs.append([
-                    np.array(query_right.feature_vector),
-                    np.array(query_left.feature_vector)
-                ])
-    for label, pair in zip(labels, docpairs):
-        yield np.array(label), pair[0], pair[1]
-
-
-def gen_list(querylist):
-    """
-  gen item in list for list-wise learning to rank algorithm
-  Paramters:
-  --------
-  querylist : querylist, one query match many docment pairs in list, see QueryList
-
-  return :
-  ------
-  label : np.array, shape=(samples_num, )
-  querylist : np.array, shape=(samples_num, feature_dimension)
-  """
-    if not isinstance(querylist, QueryList):
-        querylist = QueryList(querylist)
-    querylist._correct_ranking_()
-    relevance_score_list = [[query.relevance_score] for query in querylist]
-    feature_vector_list = [query.feature_vector for query in querylist]
-    yield np.array(relevance_score_list), np.array(feature_vector_list)
-
-
-def query_filter(querylists):
-    """
-    filter query get only document with label 0.
-    label 0, 1, 2 means the relevance score document with query
-    parameters :
-      querylist : QueyList list
-
-    return :
-      querylist : QueyList list
-    """
-    filter_query = []
-    for querylist in querylists:
-        relevance_score_list = [query.relevance_score for query in querylist]
-        if sum(relevance_score_list) != .0:
-            filter_query.append(querylist)
-    return filter_query
-
-
-def load_from_text(filepath, shuffle=False, fill_missing=-1):
-    """
-  parse data file into querys
-  """
-    prev_query_id = -1
-    querylists = []
-    querylist = None
-    fn = __initialize_meta_info__()
-    with open(os.path.join(fn, filepath)) as f:
-        for line in f:
-            query = Query()
-            query = query._parse_(line)
-            if query == None:
-                continue
-            if query.query_id != prev_query_id:
-                if querylist is not None:
-                    querylists.append(querylist)
-                querylist = QueryList()
-                prev_query_id = query.query_id
-            querylist._add_query(query)
-    if querylist is not None:
-        querylists.append(querylist)
-    return querylists
-
-
-def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
-    """
-  Parameters
-  --------
-  filename : string
-  fill_missing : fill the missing value. default in MQ2007 is -1
-  
-  Returns
-  ------
-  yield
-    label query_left, query_right  # format = "pairwise"
-    label querylist # format = "listwise"
-  """
-    querylists = query_filter(
-        load_from_text(
-            filepath, shuffle=shuffle, fill_missing=fill_missing))
-    for querylist in querylists:
-        if format == "plain_txt":
-            yield next(gen_plain_txt(querylist))
-        elif format == "pointwise":
-            yield next(gen_point(querylist))
-        elif format == "pairwise":
-            for pair in gen_pair(querylist):
-                yield pair
-        elif format == "listwise":
-            yield next(gen_list(querylist))
-
-
-train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
-test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
-
-
-def fetch():
-    return download(URL, "MQ2007", MD5)
-
-
-if __name__ == "__main__":
-    fetch()
-    mytest = functools.partial(
-        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
-    for label, query in mytest():
-        print label, query
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
deleted file mode 100644
index b0b9757c1..000000000
--- a/python/paddle/v2/dataset/sentiment.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-The script fetch and preprocess movie_reviews data set that provided by NLTK
-
-TODO(yuyang18): Complete dataset.
-"""
-
-import collections
-from itertools import chain
-
-import nltk
-from nltk.corpus import movie_reviews
-
-import paddle.v2.dataset.common
-
-__all__ = ['train', 'test', 'get_word_dict', 'convert']
-NUM_TRAINING_INSTANCES = 1600
-NUM_TOTAL_INSTANCES = 2000
-
-
-def download_data_if_not_yet():
-    """
-    Download the data set, if the data set is not download.
-    """
-    try:
-        # make sure that nltk can find the data
-        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
-        movie_reviews.categories()
-    except LookupError:
-        print "Downloading movie_reviews data set, please wait....."
-        nltk.download(
-            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
-        print "Download data set success....."
-        print "Path is " + nltk.data.find('corpora/movie_reviews').path
-
-
-def get_word_dict():
-    """
-    Sorted the words by the frequency of words which occur in sample
-    :return:
-        words_freq_sorted
-    """
-    words_freq_sorted = list()
-    word_freq_dict = collections.defaultdict(int)
-    download_data_if_not_yet()
-
-    for category in movie_reviews.categories():
-        for field in movie_reviews.fileids(category):
-            for words in movie_reviews.words(field):
-                word_freq_dict[words] += 1
-    words_sort_list = word_freq_dict.items()
-    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
-    for index, word in enumerate(words_sort_list):
-        words_freq_sorted.append((word[0], index))
-    return words_freq_sorted
-
-
-def sort_files():
-    """
-    Sorted the sample for cross reading the sample
-    :return:
-        files_list
-    """
-    files_list = list()
-    neg_file_list = movie_reviews.fileids('neg')
-    pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
-    return files_list
-
-
-def load_sentiment_data():
-    """
-    Load the data set
-    :return:
-        data_set
-    """
-    data_set = list()
-    download_data_if_not_yet()
-    words_ids = dict(get_word_dict())
-    for sample_file in sort_files():
-        words_list = list()
-        category = 0 if 'neg' in sample_file else 1
-        for word in movie_reviews.words(sample_file):
-            words_list.append(words_ids[word.lower()])
-        data_set.append((words_list, category))
-    return data_set
-
-
-def reader_creator(data):
-    """
-    Reader creator, generate an iterator for data set
-    :param data:
-        train data set or test data set
-    """
-    for each in data:
-        yield each[0], each[1]
-
-
-def train():
-    """
-    Default training set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
-
-
-def test():
-    """
-    Default test set reader creator
-    """
-    data_set = load_sentiment_data()
-    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
-
-
-def fetch():
-    nltk.download(
-        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
-    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
deleted file mode 100644
index e0e18229d..000000000
--- a/python/paddle/v2/dataset/tests/cifar_test.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.cifar
-import unittest
-
-
-class TestCIFAR(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3072)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_test10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test10())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_train10(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train10())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.test100())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 99)
-
-    def test_train100(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.cifar.train100())
-        self.assertEqual(instances, 50000)
-        self.assertEqual(max_label_value, 99)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
deleted file mode 100644
index cfa194eba..000000000
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.common
-import unittest
-import tempfile
-import glob
-
-
-class TestCommon(unittest.TestCase):
-    def test_md5file(self):
-        _, temp_path = tempfile.mkstemp()
-        with open(temp_path, 'w') as f:
-            f.write("Hello\n")
-        self.assertEqual('09f7e02f1290be211da707a266f153b3',
-                         paddle.v2.dataset.common.md5file(temp_path))
-
-    def test_download(self):
-        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
-        self.assertEqual(
-            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
-            paddle.v2.dataset.common.download(
-                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
-
-    def test_split(self):
-        def test_reader():
-            def reader():
-                for x in xrange(10):
-                    yield x
-
-            return reader
-
-        _, temp_path = tempfile.mkstemp()
-        paddle.v2.dataset.common.split(
-            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
-        files = glob.glob(temp_path + '/test-%05d.pickle')
-        self.assertEqual(len(files), 3)
-
-    def test_cluster_file_reader(self):
-        _, temp_path = tempfile.mkstemp()
-        for x in xrange(5):
-            with open(temp_path + '/%05d.test' % x) as f:
-                f.write('%d\n' % x)
-        reader = paddle.v2.dataset.common.cluster_files_reader(
-            temp_path + '/*.test', 5, 0)
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, str("0"))
-
-    def test_convert(self):
-        record_num = 10
-        num_shards = 4
-
-        def test_reader():
-            def reader():
-                for x in xrange(record_num):
-                    yield x
-
-            return reader
-
-        path = tempfile.mkdtemp()
-        paddle.v2.dataset.common.convert(path,
-                                         test_reader(), num_shards,
-                                         'random_images')
-
-        files = glob.glob(path + '/random_images-*')
-        self.assertEqual(len(files), num_shards)
-
-        recs = []
-        for i in range(0, num_shards):
-            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
-            r = recordio.reader(n)
-            while True:
-                d = r.read()
-                if d is None:
-                    break
-                recs.append(d)
-
-        recs.sort()
-        self.assertEqual(total, record_num)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
deleted file mode 100644
index a8ae9a07a..000000000
--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.flowers
-import unittest
-
-
-class TestFlowers(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        size = 224 * 224 * 3
-        for l in reader():
-            self.assertEqual(l[0].size, size)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.train())
-        self.assertEqual(instances, 6149)
-        self.assertEqual(max_label_value, 102)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.test())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-    def test_valid(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.flowers.valid())
-        self.assertEqual(instances, 1020)
-        self.assertEqual(max_label_value, 102)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
deleted file mode 100644
index c4d82f268..000000000
--- a/python/paddle/v2/dataset/tests/imdb_test.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.imdb
-import unittest
-import re
-
-TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
-TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
-TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
-
-TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
-TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
-TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
-
-
-class TestIMDB(unittest.TestCase):
-    word_idx = None
-
-    def test_build_dict(self):
-        if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
-
-        self.assertEqual(len(self.word_idx), 7036)
-
-    def check_dataset(self, dataset, expected_size):
-        if self.word_idx == None:
-            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
-                                                              150)
-
-        sum = 0
-        for l in dataset(self.word_idx):
-            self.assertEqual(l[1], sum % 2)
-            sum += 1
-        self.assertEqual(sum, expected_size)
-
-    def test_train(self):
-        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
-
-    def test_test(self):
-        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
deleted file mode 100644
index 714a75d6f..000000000
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.imikolov
-import unittest
-
-WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
-
-
-class TestMikolov(unittest.TestCase):
-    def check_reader(self, reader, n):
-        for l in reader():
-            self.assertEqual(len(l), n)
-
-    def test_train(self):
-        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
-
-        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
-            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
-            'rake regatta rubens sim snack-food ssangyong swapo wachter'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.v2.dataset.imikolov.train(
-                WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_test(self):
-        n = 5
-        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
-
-        first_line = 'consumers may want to move their telephones a little '\
-                'closer to the tv set'
-        first_line = [
-            WORD_DICT.get(ch, WORD_DICT['<unk>'])
-            for ch in first_line.split(' ')
-        ]
-        for l in paddle.v2.dataset.imikolov.test(
-                WORD_DICT, n=-1,
-                data_type=paddle.v2.dataset.imikolov.DataType.SEQ)():
-            read_line = l[0][1:]
-            break
-        self.assertEqual(first_line, read_line)
-
-    def test_total(self):
-        _, idx = zip(*WORD_DICT.items())
-        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
deleted file mode 100644
index 1d344cac3..000000000
--- a/python/paddle/v2/dataset/tests/mnist_test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.mnist
-import unittest
-
-
-class TestMNIST(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 784)
-            if l[1] > label:
-                label = l[1]
-            sum += 1
-        return sum, label
-
-    def test_train(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.train())
-        self.assertEqual(instances, 60000)
-        self.assertEqual(max_label_value, 9)
-
-    def test_test(self):
-        instances, max_label_value = self.check_reader(
-            paddle.v2.dataset.mnist.test())
-        self.assertEqual(instances, 10000)
-        self.assertEqual(max_label_value, 9)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/v2/dataset/tests/mq2007_test.py
deleted file mode 100644
index 59847b6c1..000000000
--- a/python/paddle/v2/dataset/tests/mq2007_test.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.mq2007
-import unittest
-
-
-class TestMQ2007(unittest.TestCase):
-    def test_pairwise(self):
-        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
-                format="pairwise"):
-            self.assertEqual(query_left.shape(), (46, ))
-            self.assertEqual(query_right.shape(), (46, ))
-
-    def test_listwise(self):
-        for label_array, query_array in paddle.v2.dataset.mq2007.test(
-                format="listwise"):
-            self.assertEqual(len(label_array), len(query_array))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
deleted file mode 100644
index 407405290..000000000
--- a/python/paddle/v2/dataset/tests/test_sentiment.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# /usr/bin/env python
-# -*- coding:utf-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import nltk
-import paddle.v2.dataset.sentiment as st
-from nltk.corpus import movie_reviews
-
-
-class TestSentimentMethods(unittest.TestCase):
-    def test_get_word_dict(self):
-        word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
-                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
-                          (u'is', 8), (u'in', 9)]
-        for idx, each in enumerate(word_dict):
-            self.assertEqual(each, test_word_list[idx])
-        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
-
-    def test_sort_files(self):
-        last_label = ''
-        for sample_file in st.sort_files():
-            current_label = sample_file.split("/")[0]
-            self.assertNotEqual(current_label, last_label)
-            last_label = current_label
-
-    def test_data_set(self):
-        data_set = st.load_sentiment_data()
-        last_label = -1
-        for each in st.test():
-            self.assertNotEqual(each[1], last_label)
-            last_label = each[1]
-        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
-        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
-        self.assertEqual(
-            len(list(st.test())),
-            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
deleted file mode 100644
index 31e72ebf5..000000000
--- a/python/paddle/v2/dataset/tests/voc2012_test.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.voc2012
-import unittest
-
-
-class TestVOC(unittest.TestCase):
-    def check_reader(self, reader):
-        sum = 0
-        label = 0
-        for l in reader():
-            self.assertEqual(l[0].size, 3 * l[1].size)
-            sum += 1
-        return sum
-
-    def test_train(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.train())
-        self.assertEqual(count, 2913)
-
-    def test_test(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.test())
-        self.assertEqual(count, 1464)
-
-    def test_val(self):
-        count = self.check_reader(paddle.v2.dataset.voc_seg.val())
-        self.assertEqual(count, 1449)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
deleted file mode 100644
index cef6c3216..000000000
--- a/python/paddle/v2/dataset/tests/wmt16_test.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2.dataset.wmt16
-import unittest
-
-
-class TestWMT16(unittest.TestCase):
-    def checkout_one_sample(self, sample):
-        # train data has 3 field: source language word indices,
-        # target language word indices, and target next word indices.
-        self.assertEqual(len(sample), 3)
-
-        # test start mark and end mark in source word indices.
-        self.assertEqual(sample[0][0], 0)
-        self.assertEqual(sample[0][-1], 1)
-
-        # test start mask in target word indices
-        self.assertEqual(sample[1][0], 0)
-
-        # test en mask in target next word indices
-        self.assertEqual(sample[2][-1], 1)
-
-    def test_train(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.train(
-                    src_dict_size=100000, trg_dict_size=100000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_test(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.test(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_val(self):
-        for idx, sample in enumerate(
-                paddle.v2.dataset.wmt16.validation(
-                    src_dict_size=1000, trg_dict_size=1000)()):
-            if idx >= 10: break
-            self.checkout_one_sample(sample)
-
-    def test_get_dict(self):
-        dict_size = 1000
-        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
-        self.assertEqual(len(word_dict), dict_size)
-        self.assertEqual(word_dict[0], "<s>")
-        self.assertEqual(word_dict[1], "<e>")
-        self.assertEqual(word_dict[2], "<unk>")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
deleted file mode 100644
index f10bf7e42..000000000
--- a/python/paddle/v2/dataset/uci_housing.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-UCI Housing dataset.
-
-This module will download dataset from
-https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
-parse training set and test set into paddle reader creators.
-"""
-
-import numpy as np
-import os
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
-
-__all__ = ['train', 'test']
-
-URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
-MD5 = 'd4accdce7a25600298819f8e28e8d593'
-feature_names = [
-    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
-    'PTRATIO', 'B', 'LSTAT', 'convert'
-]
-
-UCI_TRAIN_DATA = None
-UCI_TEST_DATA = None
-URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
-MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
-
-
-def feature_range(maximums, minimums):
-    import matplotlib
-    matplotlib.use('Agg')
-    import matplotlib.pyplot as plt
-    fig, ax = plt.subplots()
-    feature_num = len(maximums)
-    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
-    ax.set_title('feature scale')
-    plt.xticks(range(feature_num), feature_names)
-    plt.xlim([-1, feature_num])
-    fig.set_figheight(6)
-    fig.set_figwidth(10)
-    if not os.path.exists('./image'):
-        os.makedirs('./image')
-    fig.savefig('image/ranges.png', dpi=48)
-    plt.close(fig)
-
-
-def load_data(filename, feature_num=14, ratio=0.8):
-    global UCI_TRAIN_DATA, UCI_TEST_DATA
-    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
-        return
-
-    data = np.fromfile(filename, sep=' ')
-    data = data.reshape(data.shape[0] / feature_num, feature_num)
-    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
-        axis=0) / data.shape[0]
-    feature_range(maximums[:-1], minimums[:-1])
-    for i in xrange(feature_num - 1):
-        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
-    offset = int(data.shape[0] * ratio)
-    UCI_TRAIN_DATA = data[:offset]
-    UCI_TEST_DATA = data[offset:]
-
-
-def train():
-    """
-    UCI_HOUSING training set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    global UCI_TRAIN_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TRAIN_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def test():
-    """
-    UCI_HOUSING test set creator.
-
-    It returns a reader creator, each sample in the reader is features after
-    normalization and price number.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    global UCI_TEST_DATA
-    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
-
-    def reader():
-        for d in UCI_TEST_DATA:
-            yield d[:-1], d[-1:]
-
-    return reader
-
-
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
-                                                 MD5_MODEL)
-    with open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
-    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
deleted file mode 100644
index 617e212d6..000000000
--- a/python/paddle/v2/dataset/voc2012.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Image dataset for segmentation.
-The 2012 dataset contains images from 2008-2011 for which additional
-segmentations have been prepared. As in previous years the assignment
-to training/test sets has been maintained. The total number of images
-with segmentation has been increased from 7,062 to 9,993.
-"""
-
-import tarfile
-import io
-import numpy as np
-from paddle.v2.dataset.common import download
-from paddle.v2.image import *
-from PIL import Image
-
-__all__ = ['train', 'test', 'val']
-
-VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
-VOCtrainval_11-May-2012.tar'
-
-VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
-SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
-DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
-LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
-
-CACHE_DIR = 'voc2012'
-
-
-def reader_creator(filename, sub_name):
-
-    tarobject = tarfile.open(filename)
-    name2mem = {}
-    for ele in tarobject.getmembers():
-        name2mem[ele.name] = ele
-
-    def reader():
-        set_file = SET_FILE.format(sub_name)
-        sets = tarobject.extractfile(name2mem[set_file])
-        for line in sets:
-            line = line.strip()
-            data_file = DATA_FILE.format(line)
-            label_file = LABEL_FILE.format(line)
-            data = tarobject.extractfile(name2mem[data_file]).read()
-            label = tarobject.extractfile(name2mem[label_file]).read()
-            data = Image.open(io.BytesIO(data))
-            label = Image.open(io.BytesIO(label))
-            data = np.array(data)
-            label = np.array(label)
-            yield data, label
-
-    return reader
-
-
-def train():
-    """
-    Create a train dataset reader containing 2913 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
-
-
-def test():
-    """
-    Create a test dataset reader containing 1464 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
-
-
-def val():
-    """
-    Create a val dataset reader containing 1449 images in HWC order.
-    """
-    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
deleted file mode 100644
index b9e602f32..000000000
--- a/python/paddle/v2/dataset/wmt14.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-WMT14 dataset.
-The original WMT14 dataset is too large and a small set of data for set is
-provided. This module will download dataset from
-http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz and
-parse training set and test set into paddle reader creators.
-
-"""
-import tarfile
-import gzip
-
-import paddle.v2.dataset.common
-from paddle.v2.parameters import Parameters
-
-__all__ = [
-    'train',
-    'test',
-    'get_dict',
-    'convert',
-]
-
-URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
-                'cslm_joint_paper/data/dev+test.tgz')
-MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-# this is a small set of data for test. The original data is too large and
-# will be add later.
-URL_TRAIN = ('http://paddlemodels.bj.bcebos.com/wmt/wmt14.tgz')
-MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# BLEU of this trained model is 26.92
-URL_MODEL = 'http://paddlemodels.bj.bcebos.com/wmt%2Fwmt14.tgz'
-MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
-
-START = "<s>"
-END = "<e>"
-UNK = "<unk>"
-UNK_IDX = 2
-
-
-def __read_to_dict(tar_file, dict_size):
-    def __to_dict(fd, size):
-        out_dict = dict()
-        for line_count, line in enumerate(fd):
-            if line_count < size:
-                out_dict[line.strip()] = line_count
-            else:
-                break
-        return out_dict
-
-    with tarfile.open(tar_file, mode='r') as f:
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("src.dict")
-        ]
-        assert len(names) == 1
-        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        names = [
-            each_item.name for each_item in f
-            if each_item.name.endswith("trg.dict")
-        ]
-        assert len(names) == 1
-        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
-        return src_dict, trg_dict
-
-
-def reader_creator(tar_file, file_name, dict_size):
-    def reader():
-        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-        with tarfile.open(tar_file, mode='r') as f:
-            names = [
-                each_item.name for each_item in f
-                if each_item.name.endswith(file_name)
-            ]
-            for name in names:
-                for line in f.extractfile(name):
-                    line_split = line.strip().split('\t')
-                    if len(line_split) != 2:
-                        continue
-                    src_seq = line_split[0]  # one source sequence
-                    src_words = src_seq.split()
-                    src_ids = [
-                        src_dict.get(w, UNK_IDX)
-                        for w in [START] + src_words + [END]
-                    ]
-
-                    trg_seq = line_split[1]  # one target sequence
-                    trg_words = trg_seq.split()
-                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
-
-                    # remove sequence whose length > 80 in training mode
-                    if len(src_ids) > 80 or len(trg_ids) > 80:
-                        continue
-                    trg_ids_next = trg_ids + [trg_dict[END]]
-                    trg_ids = [trg_dict[START]] + trg_ids
-
-                    yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(dict_size):
-    """
-    WMT14 training set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'train/train', dict_size)
-
-
-def test(dict_size):
-    """
-    WMT14 test set creator.
-
-    It returns a reader creator, each sample in the reader is source language
-    word ID sequence, target language word ID sequence and next word ID
-    sequence.
-
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'test/test', dict_size)
-
-
-def gen(dict_size):
-    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
-        'gen/gen', dict_size)
-
-
-def model():
-    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-    with gzip.open(tar_file, 'r') as f:
-        parameters = Parameters.from_tar(f)
-    return parameters
-
-
-def get_dict(dict_size, reverse=True):
-    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
-    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
-    if reverse:
-        src_dict = {v: k for k, v in src_dict.items()}
-        trg_dict = {v: k for k, v in trg_dict.items()}
-    return src_dict, trg_dict
-
-
-def fetch():
-    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
-
-
-def convert(path):
-    """
-    Converts dataset to recordio format
-    """
-    dict_size = 30000
-    paddle.v2.dataset.common.convert(path,
-                                     train(dict_size), 1000, "wmt14_train")
-    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
deleted file mode 100644
index 579300209..000000000
--- a/python/paddle/v2/dataset/wmt16.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-ACL2016 Multimodal Machine Translation. Please see this website for more
-details: http://www.statmt.org/wmt16/multimodal-task.html#task1
-
-If you use the dataset created for your task, please cite the following paper:
-Multi30K: Multilingual English-German Image Descriptions.
-
-@article{elliott-EtAl:2016:VL16,
- author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
- title     = {Multi30K: Multilingual English-German Image Descriptions},
- booktitle = {Proceedings of the 6th Workshop on Vision and Language},
- year      = {2016},
- pages     = {70--74},
- year      = 2016
-}
-"""
-
-import os
-import tarfile
-import gzip
-from collections import defaultdict
-
-import paddle.v2.dataset.common
-
-__all__ = [
-    "train",
-    "test",
-    "validation",
-    "convert",
-    "fetch",
-    "get_dict",
-]
-
-DATA_URL = ("http://cloud.dlnel.org/filepub/"
-            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
-DATA_MD5 = "0c38be43600334966403524a40dcd81e"
-
-TOTAL_EN_WORDS = 11250
-TOTAL_DE_WORDS = 19220
-
-START_MARK = "<s>"
-END_MARK = "<e>"
-UNK_MARK = "<unk>"
-
-
-def __build_dict(tar_file, dict_size, save_path, lang):
-    word_dict = defaultdict(int)
-    with tarfile.open(tar_file, mode="r") as f:
-        for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split("\t")
-            if len(line_split) != 2: continue
-            sen = line_split[0] if lang == "en" else line_split[1]
-            for w in sen.split():
-                word_dict[w] += 1
-
-    with open(save_path, "w") as fout:
-        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
-        for idx, word in enumerate(
-                sorted(
-                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
-            if idx + 3 == dict_size: break
-            fout.write(word[0].encode('utf-8'))
-            fout.write('\n')
-
-
-def __load_dict(tar_file, dict_size, lang, reverse=False):
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    if not os.path.exists(dict_path) or (
-            len(open(dict_path, "r").readlines()) != dict_size):
-        __build_dict(tar_file, dict_size, dict_path, lang)
-
-    word_dict = {}
-    with open(dict_path, "r") as fdict:
-        for idx, line in enumerate(fdict):
-            if reverse:
-                word_dict[idx] = line.strip()
-            else:
-                word_dict[line.strip()] = idx
-    return word_dict
-
-
-def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
-    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
-                                        TOTAL_DE_WORDS))
-    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
-                                        TOTAL_ENG_WORDS))
-    return src_dict_size, trg_dict_size
-
-
-def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
-    def reader():
-        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
-        trg_dict = __load_dict(tar_file, trg_dict_size,
-                               ("de" if src_lang == "en" else "en"))
-
-        # the indice for start mark, end mark, and unk are the same in source
-        # language and target language. Here uses the source language
-        # dictionary to determine their indices.
-        start_id = src_dict[START_MARK]
-        end_id = src_dict[END_MARK]
-        unk_id = src_dict[UNK_MARK]
-
-        src_col = 0 if src_lang == "en" else 1
-        trg_col = 1 - src_col
-
-        with tarfile.open(tar_file, mode="r") as f:
-            for line in f.extractfile(file_name):
-                line_split = line.strip().split("\t")
-                if len(line_split) != 2:
-                    continue
-                src_words = line_split[src_col].split()
-                src_ids = [start_id] + [
-                    src_dict.get(w, unk_id) for w in src_words
-                ] + [end_id]
-
-                trg_words = line_split[trg_col].split()
-                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
-
-                trg_ids_next = trg_ids + [end_id]
-                trg_ids = [start_id] + trg_ids
-
-                yield src_ids, trg_ids, trg_ids_next
-
-    return reader
-
-
-def train(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 train set reader.
-
-    This function returns the reader for train data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-
-    NOTE:
-    The original like for training data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The train reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type.  Only support: "
-                         "en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/train",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def test(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 test set reader.
-
-    This function returns the reader for test data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for test data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The test reader.
-    """
-
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/test",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def validation(src_dict_size, trg_dict_size, src_lang="en"):
-    """
-    WMT16 validation set reader.
-
-    This function returns the reader for validation data. Each sample the reader
-    returns is made up of three fields: the source language word index sequence,
-    target language word index sequence and next word index sequence.
-
-    NOTE:
-    The original like for validation data is:
-    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
-
-    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
-    using moses's tokenization script:
-    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
-
-    Args:
-        src_dict_size(int): Size of the source language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        trg_dict_size(int): Size of the target language dictionary. Three
-                            special tokens will be added into the dictionary:
-                            <s> for start mark, <e> for end mark, and <unk> for
-                            unknown word.
-        src_lang(string): A string indicating which language is the source
-                          language. Available options are: "en" for English
-                          and "de" for Germany.
-
-    Returns:
-        callable: The validation reader.
-    """
-    if src_lang not in ["en", "de"]:
-        raise ValueError("An error language type. "
-                         "Only support: en (for English); de(for Germany).")
-    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
-                                                   src_lang)
-
-    return reader_creator(
-        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                                   "wmt16.tar.gz"),
-        file_name="wmt16/val",
-        src_dict_size=src_dict_size,
-        trg_dict_size=trg_dict_size,
-        src_lang=src_lang)
-
-
-def get_dict(lang, dict_size, reverse=False):
-    """
-    return the word dictionary for the specified language.
-
-    Args:
-        lang(string): A string indicating which language is the source
-                      language. Available options are: "en" for English
-                      and "de" for Germany.
-        dict_size(int): Size of the specified language dictionary.
-        reverse(bool): If reverse is set to False, the returned python
-                       dictionary will use word as key and use index as value.
-                       If reverse is set to True, the returned python
-                       dictionary will use index as key and word as value.
-
-    Returns:
-        dict: The word dictionary for the specific language.
-    """
-
-    if lang == "en":
-        dict_size = min(dict_size, TOTAL_EN_WORDS)
-    else:
-        dict_size = min(dict_size, TOTAL_DE_WORDS)
-
-    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
-                             "wmt16/%s_%d.dict" % (lang, dict_size))
-    assert os.path.exists(dict_path), "Word dictionary does not exist. "
-    "Please invoke paddle.dataset.wmt16.train/test/validation first "
-    "to build the dictionary."
-    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
-    return __load_dict(tar_file, dict_size, lang, reverse)
-
-
-def fetch():
-    """download the entire dataset.
-    """
-    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
-                                      "wmt16.tar.gz")
-
-
-def convert(path, src_dict_size, trg_dict_size, src_lang):
-    """Converts dataset to recordio format.
-    """
-
-    paddle.v2.dataset.common.convert(
-        path,
-        train(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_train")
-    paddle.v2.dataset.common.convert(
-        path,
-        test(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_test")
-    paddle.v2.dataset.common.convert(
-        path,
-        validation(
-            src_dict_size=src_dict_size,
-            trg_dict_size=trg_dict_size,
-            src_lang=src_lang),
-        1000,
-        "wmt16_validation")
diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py
deleted file mode 100644
index eaaadbe53..000000000
--- a/python/paddle/v2/evaluator.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.evaluators as evs
-from config_base import __convert_to_v2__
-import inspect
-
-__all__ = []
-
-
-def initialize():
-    def convert_to_new_name(nm):
-        return nm[:-len("_evaluator")]
-
-    for __ev_name__ in filter(lambda x: x.endswith('_evaluator'), evs.__all__):
-        __ev__ = getattr(evs, __ev_name__)
-        __new_name__ = convert_to_new_name(__ev_name__)
-
-        globals()[__new_name__] = __convert_to_v2__(__ev__, __new_name__,
-                                                    __name__)
-        globals()[__new_name__].__name__ = __new_name__
-        __all__.append(__new_name__)
-
-
-initialize()
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
deleted file mode 100644
index c11aa121c..000000000
--- a/python/paddle/v2/event.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Testing and training events.
-
-There are:
-
-* TestResult
-* BeginIteration
-* EndIteration
-* BeginPass
-* EndPass
-"""
-__all__ = [
-    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult',
-    'EndForwardBackward'
-]
-
-
-class WithMetric(object):
-    def __init__(self, evaluator):
-        import py_paddle.swig_paddle as api
-        if not isinstance(evaluator, api.Evaluator):
-            raise TypeError("Evaluator should be api.Evaluator type")
-        self.__evaluator__ = evaluator
-
-    @property
-    def metrics(self):
-        names = self.__evaluator__.getNames()
-        retv = dict()
-        for each_name in names:
-            val = self.__evaluator__.getValue(each_name)
-            retv[each_name] = val
-        return retv
-
-
-class TestResult(WithMetric):
-    """
-    Result that trainer.test return.
-    """
-
-    def __init__(self, evaluator, cost):
-        super(TestResult, self).__init__(evaluator)
-        self.cost = cost
-
-
-class BeginPass(object):
-    """
-    Event On One Pass Training Start.
-    """
-
-    def __init__(self, pass_id):
-        self.pass_id = pass_id
-
-
-class EndPass(WithMetric):
-    """
-    Event On One Pass Training Complete.
-    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
-    in your event_handler call back
-    """
-
-    def __init__(self, pass_id, evaluator, gm):
-        self.pass_id = pass_id
-        self.gm = gm
-        WithMetric.__init__(self, evaluator)
-
-
-class BeginIteration(object):
-    """
-    Event On One Batch Training Start.
-    """
-
-    def __init__(self, pass_id, batch_id):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-
-
-class EndForwardBackward(object):
-    """
-    Event On One Batch ForwardBackward Complete.
-    """
-
-    def __init__(self, pass_id, batch_id, gm):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-        self.gm = gm
-
-
-class EndIteration(WithMetric):
-    """
-    Event On One Batch Training Complete.
-    To get the output of a specific layer, add "event.gm.getLayerOutputs('predict_layer')"
-    in your event_handler call back
-    """
-
-    def __init__(self, pass_id, batch_id, cost, evaluator, gm):
-        self.pass_id = pass_id
-        self.batch_id = batch_id
-        self.cost = cost
-        self.gm = gm
-        WithMetric.__init__(self, evaluator)
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
deleted file mode 100644
index 08d8bd68f..000000000
--- a/python/paddle/v2/image.py
+++ /dev/null
@@ -1,380 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This file contains some common interfaces for image preprocess.
-Many users are confused about the image layout. We introduce
-the image layout as follows.
-
-- CHW Layout
-
-  - The abbreviations: C=channel, H=Height, W=Width
-  - The default layout of image opened by cv2 or PIL is HWC.
-    PaddlePaddle only supports the CHW layout. And CHW is simply
-    a transpose of HWC. It must transpose the input image.
-
-- Color format: RGB or BGR
-
-  OpenCV use BGR color format. PIL use RGB color format. Both
-  formats can be used for training. Noted that, the format should
-  be keep consistent between the training and inference peroid.
-"""
-import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
-import os
-import tarfile
-import cPickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
-
-
-def batch_images_from_tar(data_file,
-                          dataset_name,
-                          img2label,
-                          num_per_batch=1024):
-    """
-    Read images from tar file and batch them into batch file.
-
-    :param data_file: path of image tar file
-    :type data_file: string
-    :param dataset_name: 'train','test' or 'valid'
-    :type dataset_name: string
-    :param img2label: a dic with image file name as key 
-                    and image's label as value
-    :type img2label: dic
-    :param num_per_batch: image number per batch file
-    :type num_per_batch: int
-    :return: path of list file containing paths of batch file
-    :rtype: string
-    """
-    batch_dir = data_file + "_batch"
-    out_path = "%s/%s" % (batch_dir, dataset_name)
-    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
-
-    if os.path.exists(out_path):
-        return meta_file
-    else:
-        os.makedirs(out_path)
-
-    tf = tarfile.open(data_file)
-    mems = tf.getmembers()
-    data = []
-    labels = []
-    file_id = 0
-    for mem in mems:
-        if mem.name in img2label:
-            data.append(tf.extractfile(mem).read())
-            labels.append(img2label[mem.name])
-            if len(data) == num_per_batch:
-                output = {}
-                output['label'] = labels
-                output['data'] = data
-                cPickle.dump(
-                    output,
-                    open('%s/batch_%d' % (out_path, file_id), 'w'),
-                    protocol=cPickle.HIGHEST_PROTOCOL)
-                file_id += 1
-                data = []
-                labels = []
-    if len(data) > 0:
-        output = {}
-        output['label'] = labels
-        output['data'] = data
-        cPickle.dump(
-            output,
-            open('%s/batch_%d' % (out_path, file_id), 'w'),
-            protocol=cPickle.HIGHEST_PROTOCOL)
-
-    with open(meta_file, 'a') as meta:
-        for file in os.listdir(out_path):
-            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
-    return meta_file
-
-
-def load_image_bytes(bytes, is_color=True):
-    """
-    Load an color or gray image from bytes array.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        with open('cat.jpg') as f:
-            im = load_image_bytes(f.read())
-
-    :param bytes: the input image bytes array.
-    :type bytes: str
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    flag = 1 if is_color else 0
-    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
-    img = cv2.imdecode(file_bytes, flag)
-    return img
-
-
-def load_image(file, is_color=True):
-    """
-    Load an color or gray image from the file path.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-
-    :param file: the input image path.
-    :type file: string
-    :param is_color: If set is_color True, it will load and
-                     return a color image. Otherwise, it will
-                     load and return a gray image.
-    :type is_color: bool
-    """
-    # cv2.IMAGE_COLOR for OpenCV3
-    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
-    # cv2.IMAGE_GRAYSCALE for OpenCV3
-    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
-    # Here, use constant 1 and 0
-    # 1: COLOR, 0: GRAYSCALE
-    flag = 1 if is_color else 0
-    im = cv2.imread(file, flag)
-    return im
-
-
-def resize_short(im, size):
-    """ 
-    Resize an image so that the length of shorter edge is size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the shorter edge size of image after resizing.
-    :type size: int
-    """
-    h, w = im.shape[:2]
-    h_new, w_new = size, size
-    if h > w:
-        h_new = size * h / w
-    else:
-        w_new = size * w / h
-    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
-    return im
-
-
-def to_chw(im, order=(2, 0, 1)):
-    """
-    Transpose the input image order. The image layout is HWC format
-    opened by cv2 or PIL. Transpose the input image to CHW layout
-    according the order (2,0,1).
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_image('cat.jpg')
-        im = resize_short(im, 256)
-        im = to_chw(im)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param order: the transposed order.
-    :type order: tuple|list 
-    """
-    assert len(im.shape) == len(order)
-    im = im.transpose(order)
-    return im
-
-
-def center_crop(im, size, is_color=True):
-    """
-    Crop the center of image with size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = center_crop(im, 224)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = (h - size) / 2
-    w_start = (w - size) / 2
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def random_crop(im, size, is_color=True):
-    """
-    Randomly crop input image with size.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = random_crop(im, 224)
-    
-    :param im: the input image with HWC layout.
-    :type im: ndarray
-    :param size: the cropping size.
-    :type size: int
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    """
-    h, w = im.shape[:2]
-    h_start = np.random.randint(0, h - size + 1)
-    w_start = np.random.randint(0, w - size + 1)
-    h_end, w_end = h_start + size, w_start + size
-    if is_color:
-        im = im[h_start:h_end, w_start:w_end, :]
-    else:
-        im = im[h_start:h_end, w_start:w_end]
-    return im
-
-
-def left_right_flip(im, is_color=True):
-    """
-    Flip an image along the horizontal direction.
-    Return the flipped image.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = left_right_flip(im)
-    
-    :param im: input image with HWC layout or HW layout for gray image
-    :type im: ndarray
-    :param is_color: whether input image is color or not
-    :type is_color: bool
-    """
-    if len(im.shape) == 3 and is_color:
-        return im[:, ::-1, :]
-    else:
-        return im[:, ::-1]
-
-
-def simple_transform(im,
-                     resize_size,
-                     crop_size,
-                     is_train,
-                     is_color=True,
-                     mean=None):
-    """
-    Simply data argumentation for training. These operations include
-    resizing, croping and flipping.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = simple_transform(im, 256, 224, True)
-
-    :param im: The input image with HWC layout.
-    :type im: ndarray
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = resize_short(im, resize_size)
-    if is_train:
-        im = random_crop(im, crop_size, is_color=is_color)
-        if np.random.randint(2) == 0:
-            im = left_right_flip(im, is_color)
-    else:
-        im = center_crop(im, crop_size, is_color=is_color)
-    if len(im.shape) == 3:
-        im = to_chw(im)
-
-    im = im.astype('float32')
-    if mean is not None:
-        mean = np.array(mean, dtype=np.float32)
-        # mean value, may be one value per channel 
-        if mean.ndim == 1 and is_color:
-            mean = mean[:, np.newaxis, np.newaxis]
-        elif mean.ndim == 1:
-            mean = mean
-        else:
-            # elementwise mean
-            assert len(mean.shape) == len(im)
-        im -= mean
-
-    return im
-
-
-def load_and_transform(filename,
-                       resize_size,
-                       crop_size,
-                       is_train,
-                       is_color=True,
-                       mean=None):
-    """
-    Load image from the input file `filename` and transform image for
-    data argumentation. Please refer to the `simple_transform` interface
-    for the transform operations.
-
-    Example usage:
-    
-    .. code-block:: python
-
-        im = load_and_transform('cat.jpg', 256, 224, True)
-
-    :param filename: The file name of input image.
-    :type filename: string
-    :param resize_size: The shorter edge length of the resized image.
-    :type resize_size: int
-    :param crop_size: The cropping size.
-    :type crop_size: int
-    :param is_train: Whether it is training or not.
-    :type is_train: bool
-    :param is_color: whether the image is color or not.
-    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
-                 mean values per channel.
-    :type mean: numpy array | list
-    """
-    im = load_image(filename, is_color)
-    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
-    return im
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
deleted file mode 100644
index 28ee04228..000000000
--- a/python/paddle/v2/inference.py
+++ /dev/null
@@ -1,172 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy
-import collections
-import topology
-import paddle
-import cPickle
-
-__all__ = ['infer', 'Inference']
-
-
-class Inference(object):
-    """
-    Inference combines neural network output and parameters together
-    to do inference.
-
-    ..  code-block:: python
-
-        inferer = Inference(output_layer=prediction, parameters=parameters)
-        for data_batch in batches:
-            print inferer.infer(data_batch)
-
-
-    :param output_layer: The neural network that should be inferenced.
-    :type output_layer: paddle.v2.config_base.Layer or the sequence
-                        of paddle.v2.config_base.Layer
-    :param parameters: The parameters dictionary.
-    :type parameters: paddle.v2.parameters.Parameters
-    """
-
-    def __init__(self, parameters, output_layer=None, fileobj=None):
-        import py_paddle.swig_paddle as api
-
-        if output_layer is not None:
-            topo = topology.Topology(output_layer)
-            gm = api.GradientMachine.createFromConfigProto(
-                topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
-            self.__data_types__ = topo.data_type()
-        elif fileobj is not None:
-            tmp = cPickle.load(fileobj)
-            gm = api.GradientMachine.createByConfigProtoStr(
-                tmp['protobin'], api.CREATE_MODE_TESTING,
-                [api.PARAMETER_VALUE])
-            self.__data_types__ = tmp['data_type']
-        else:
-            raise ValueError("Either output_layer or fileobj must be set")
-
-        for param in gm.getParameters():
-            val = param.getBuf(api.PARAMETER_VALUE)
-            name = param.getName()
-            assert isinstance(val, api.Vector)
-            val.copyFromNumpyArray(parameters.get(name).flatten())
-            # the setValueUpdated function is called in randomize, zeroMem,
-            # load function in paddle/legacy/parameter/Parameter.cpp. But in the
-            # inference mode, the setValueUpdated is never called, it will
-            # cause the parameter will not be dispatched
-            # in MultiGradientMachine for multi-GPU. So setValueUpdated is
-            # called here, but it's better to call this function in one place.
-            param.setValueUpdated()
-        self.__gradient_machine__ = gm
-
-    def iter_infer(self, input, feeding=None):
-        from data_feeder import DataFeeder
-        feeder = DataFeeder(self.__data_types__, feeding)
-        batch_size = len(input)
-
-        def __reader_impl__():
-            for each_sample in input:
-                yield each_sample
-
-        reader = paddle.batch(__reader_impl__, batch_size=batch_size)
-
-        self.__gradient_machine__.start()
-        for data_batch in reader():
-            yield self.__gradient_machine__.forwardTest(feeder(data_batch))
-        self.__gradient_machine__.finish()
-
-    def iter_infer_field(self, field, **kwargs):
-        if not isinstance(field, list) and not isinstance(field, tuple):
-            field = [field]
-
-        for result in self.iter_infer(**kwargs):
-            for each_result in result:
-                item = [each_result[each_field] for each_field in field]
-                yield item
-
-    def infer(self, input, field='value', flatten_result=True, **kwargs):
-        """
-        Infer a data by model.
-        :param input: input data batch. Should be python iterable object.
-        :param field: output field.
-        """
-        retv = None
-        kwargs['input'] = input
-        for result in self.iter_infer_field(field=field, **kwargs):
-            if retv is None:
-                retv = [[] for i in xrange(len(result))]
-            for i, item in enumerate(result):
-                retv[i].append(item)
-
-        if retv == None:
-            return []
-
-        if flatten_result:
-            retv = [numpy.concatenate(out) for out in retv]
-
-        if len(retv) == 1:
-            return retv[0]
-        else:
-            return retv
-
-
-def infer(output_layer, parameters, input, feeding=None, field='value'):
-    """
-    Infer a neural network by given neural network output and parameters.  The
-    user should pass either a batch of input data or reader method.
-
-    Example usage for sinlge output_layer:
-
-    ..  code-block:: python
-
-        result = paddle.infer(output_layer=prediction,
-                              parameters=parameters,
-                              input=SomeData)
-        print result
-
-    Example usage for multiple outout_layers and fields:
-
-    ..  code-block:: python
-
-        result = paddle.infer(output_layer=[prediction1, prediction2],
-                              parameters=parameters,
-                              input=SomeData,
-                              field=[id, value]])
-        print result
-
-    :param output_layer: output of the neural network that would be inferred
-    :type output_layer: paddle.v2.config_base.Layer or a list of
-                        paddle.v2.config_base.Layer
-    :param parameters: parameters of the neural network.
-    :type parameters: paddle.v2.parameters.Parameters
-    :param input: input data batch. Should be a python iterable object, and each
-                  element is the data batch.
-    :type input: collections.Iterable
-    :param feeding: Reader dictionary. Default could generate from input
-                        value.
-    :param field: The prediction field. It should in [`value`, `id`, `prob`].
-                  `value` and `prob` mean return the prediction probabilities,
-                  `id` means return the prediction labels. Default is `value`.
-                  Note that `prob` only used when output_layer is beam_search
-                  or max_id.
-    :type field: str
-    :return: The prediction result. If there are multiple outout_layers and fields,
-             the return order is outout_layer1.field1, outout_layer2.field1, ...,
-             outout_layer1.field2, outout_layer2.field2 ...
-    :rtype: numpy.ndarray
-    """
-
-    inferer = Inference(output_layer=output_layer, parameters=parameters)
-    return inferer.infer(field=field, input=input, feeding=feeding)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
deleted file mode 100644
index a188a03eb..000000000
--- a/python/paddle/v2/layer.py
+++ /dev/null
@@ -1,326 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-`paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
-we want to make Paddle a plain Python package. The model config package defines
-the way how to configure a neural network topology in Paddle Python code.
-
-The primary usage shows below.
-
-..  code-block:: python
-
-    import paddle
-
-    img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
-    hidden = paddle.layer.fc(input=img, size=200)
-    prediction = paddle.layer.fc(input=hidden, size=10,
-                                 act=paddle.activation.Softmax())
-
-    # use prediction instance where needed.
-    parameters = paddle.parameters.create(cost)
-"""
-import collections
-import copy
-import re
-import paddle.trainer_config_helpers.layers as v1_layers
-import paddle.trainer.config_parser as cp
-from paddle.proto.ModelConfig_pb2 import ModelConfig, SubModelConfig
-from config_base import __convert_to_v2__
-import config_base
-
-__all__ = ['data', 'parse_network']
-
-
-def __need_to_keep__(name):
-    return name in [
-        'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
-        'layer_support', 'BaseGeneratedInput'
-    ]
-
-
-def __need_to_wrap__(name):
-    return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
-
-
-def __convert_name__(inname):
-    if __need_to_keep__(inname):
-        return inname
-    if inname == 'maxid_layer':
-        return 'max_id'
-    elif inname.endswith('memory') or inname.endswith(
-            '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
-        return inname
-    elif inname in [
-            'cross_entropy', 'multi_binary_label_cross_entropy',
-            'cross_entropy_with_selfnorm'
-    ]:
-        return inname + "_cost"
-    elif inname.endswith('_cost'):
-        return inname
-    elif inname.endswith("_layer"):
-        return inname[:-len("_layer")]
-    else:
-        return inname
-
-
-for name in v1_layers.__all__:
-    obj = getattr(v1_layers, name)
-    new_name = __convert_name__(name)
-    if callable(obj) and __need_to_wrap__(name):
-        globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
-    else:
-        globals()[new_name] = obj
-    __all__.append(new_name)
-
-
-def __data_layer__(name, type, **kwargs):
-    l = v1_layers.data_layer(name, type.dim, **kwargs)
-    l.data_type = type
-    return l
-
-
-def __map_data_docstr__(doc):
-    doc = re.sub(r'(data = [^\)]+)\).*',
-                 "data = paddle.layer.data(name=\"input\", "
-                 "type=paddle.data_type.dense_vector(1000))", doc)
-
-    doc = re.sub(r':param size:.*', ':param type: Data type of this data layer',
-                 doc)
-    doc = re.sub(r':type size:.*', ":type size: paddle.v2.data_type.InputType",
-                 doc)
-    return doc
-
-
-__data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
-
-data = __convert_to_v2__(__data_layer__, 'name', __name__)
-
-
-def __get_used_layers__(output_layers):
-    layer_names = set()
-    parents = {}
-
-    def add_parent(child, parent):
-        if child in parents:
-            parents[child].append(parent)
-        else:
-            parents[child] = [parent]
-
-    def add_additional_parents():
-        for sub_model in cp.g_config.model_config.sub_models:
-            if sub_model.name == 'root':
-                continue
-            for link in sub_model.in_links:
-                add_parent(link.link_name, link.layer_name)
-                add_parent(sub_model.name, link.layer_name)
-            for link in sub_model.out_links:
-                add_parent(link.link_name, link.layer_name)
-                add_parent(link.link_name, sub_model.name)
-            for mem in sub_model.memories:
-                if mem.boot_layer_name:
-                    add_parent(mem.layer_name, mem.boot_layer_name)
-                add_parent(mem.link_name, mem.layer_name)
-
-            if sub_model.HasField('generator'):
-                # according to the implementation of text generation
-                # in recurrent layer group, the generated word must be
-                # the first out link
-                add_parent(sub_model.out_links[0].layer_name,
-                           sub_model.generator.eos_layer_name)
-
-    def dfs_travel(layer_name):
-        if layer_name in layer_names:
-            return
-        layer_names.add(layer_name)
-        layer = cp.g_layer_map[layer_name]
-
-        for inp in layer.inputs:
-            dfs_travel(inp.input_layer_name)
-        if layer.name in parents:
-            for p in parents[layer.name]:
-                dfs_travel(p)
-
-    add_additional_parents()
-
-    for layer in output_layers:
-        dfs_travel(layer.full_name)
-
-    # print layer needs to be specially handled because no other
-    # layer depends on it. It is used to print the result of some
-    # layers when running the model for debug purpose. So we explicitly
-    # add a print layer to the topolty if its input is in the toplogy.
-    for layer in cp.g_config.model_config.layers:
-        if layer.type == 'print':
-            used = True
-            for inp in layer.inputs:
-                if inp.input_layer_name not in layer_names:
-                    used = False
-                    break
-            if used:
-                layer_names.add(layer.name)
-
-    return layer_names
-
-
-def __get_used_parameters__(layer_names, sub_models):
-    parameter_names = set()
-    for name in layer_names:
-        l = cp.g_layer_map[name]
-        for inp in l.inputs:
-            if inp.input_parameter_name:
-                parameter_names.add(inp.input_parameter_name)
-        if l.bias_parameter_name:
-            parameter_names.add(l.bias_parameter_name)
-
-    for sub_model in sub_models:
-        for mem in sub_model.memories:
-            if mem.HasField("boot_bias_parameter_name"):
-                parameter_names.add(mem.boot_bias_parameter_name)
-
-    return parameter_names
-
-
-def __get_used_submodels__(layer_names):
-    submodel_names = set()
-    for submodel in cp.g_config.model_config.sub_models:
-        if submodel.name in layer_names:
-            submodel_names.add(submodel.name)
-    return submodel_names
-
-
-def __get_submodel_data_out_links__():
-    data_links = set()
-    for submodel in cp.g_config.model_config.sub_models:
-        for link in submodel.out_links:
-            if cp.g_layer_map[link.link_name].type == 'data':
-                data_links.add(link.link_name)
-    return data_links
-
-
-def __get_used_evaluators__(layer_names):
-    evaluator_names = set()
-    for e in cp.g_config.model_config.evaluators:
-        used = True
-        for name in e.input_layers:
-            if name not in layer_names:
-                used = False
-                break
-        if used:
-            evaluator_names.add(e.name)
-    return evaluator_names
-
-
-def __trim_submodel__(old_submodel, layer_names, input_layer_names,
-                      output_layer_names, evaluator_names):
-
-    submodel = SubModelConfig()
-    submodel.name = old_submodel.name
-    submodel.layer_names.extend(
-        filter(lambda x: x in layer_names, old_submodel.layer_names))
-    submodel.input_layer_names.extend(
-        filter(lambda x: x in input_layer_names, submodel.layer_names))
-    submodel.output_layer_names.extend(
-        filter(lambda x: x in output_layer_names, submodel.layer_names))
-    submodel.evaluator_names.extend(
-        filter(lambda x: x in evaluator_names, old_submodel.evaluator_names))
-
-    submodel.is_recurrent_layer_group = old_submodel.is_recurrent_layer_group
-    submodel.reversed = old_submodel.reversed
-
-    submodel.memories.extend(
-        filter(lambda x: x.link_name in layer_names, old_submodel.memories))
-    target_inlinkid = (old_submodel.target_inlinkid
-                       if old_submodel.HasField('target_inlinkid') else -1)
-    in_links = []
-    for i, link in enumerate(old_submodel.in_links):
-        if link.link_name in layer_names or i == target_inlinkid:
-            in_links.append(link)
-            if i == target_inlinkid:
-                target_inlinkid = len(in_links) - 1
-    submodel.in_links.extend(in_links)
-
-    submodel.out_links.extend(
-        filter(lambda x: x.link_name in layer_names, old_submodel.out_links))
-    if old_submodel.HasField('generator'):
-        submodel.generator.CopyFrom(old_submodel.generator)
-
-    if old_submodel.HasField('target_inlinkid'):
-        submodel.target_inlinkid = target_inlinkid
-    return submodel
-
-
-def parse_network(output_layers, extra_layers=None):
-    if not isinstance(output_layers, collections.Sequence):
-        output_layers = [output_layers]
-    if extra_layers is not None:
-        if not isinstance(extra_layers, collections.Sequence):
-            extra_layers = [extra_layers]
-    else:
-        extra_layers = []
-
-    layer_names = __get_used_layers__(list(output_layers) + list(extra_layers))
-    submodel_names = __get_used_submodels__(layer_names)
-    submodel_names.add('root')
-    evaluator_names = __get_used_evaluators__(layer_names)
-    data_out_links = __get_submodel_data_out_links__()
-    input_layer_names = set()
-    output_layer_names = set()
-
-    model_config = ModelConfig()
-    model_config.type = cp.g_config.model_config.type
-
-    for layer in output_layers:
-        model_config.output_layer_names.append(layer.full_name)
-        output_layer_names.add(layer.full_name)
-
-    for l in cp.g_config.model_config.layers:
-        if l.name not in layer_names:
-            continue
-        model_config.layers.extend([l])
-        if l.type == 'data':
-            if l.name in data_out_links:
-                """
-                In text generation, the outlink to save the generated word
-                indices is a data_layer defined in recurrent_group. This
-                data_layer is sure to be the output of the network in text
-                generation task, so this statement excludes such a special
-                data_layer from being inputs of the network, otherwise an error
-                will occur during data feeding.
-                """
-                continue
-            model_config.input_layer_names.append(l.name)
-            input_layer_names.add(l.name)
-
-    for e in cp.g_config.model_config.evaluators:
-        if e.name in evaluator_names:
-            model_config.evaluators.extend([e])
-
-    for s in cp.g_config.model_config.sub_models:
-        if s.name in submodel_names:
-            s = __trim_submodel__(s, layer_names, input_layer_names,
-                                  output_layer_names, evaluator_names)
-            model_config.sub_models.extend([s])
-
-    parameter_names = __get_used_parameters__(layer_names,
-                                              model_config.sub_models)
-
-    for p in cp.g_config.model_config.parameters:
-        if p.name in parameter_names:
-            model_config.parameters.extend([p])
-
-    return model_config
-
-
-def get_layer(name):
-    return config_base.__layer_map__.get(name)
diff --git a/python/paddle/v2/master/.gitignore b/python/paddle/v2/master/.gitignore
deleted file mode 100644
index a3ac6e1a3..000000000
--- a/python/paddle/v2/master/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.whl
-*.so
-*.pyc
diff --git a/python/paddle/v2/master/__init__.py b/python/paddle/v2/master/__init__.py
deleted file mode 100644
index efaeeabfa..000000000
--- a/python/paddle/v2/master/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from client import *
-
-__all__ = ['client']
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
deleted file mode 100644
index d62e7cc28..000000000
--- a/python/paddle/v2/master/client.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ctypes
-import os
-
-__lib__ = None
-
-
-def get_c_lib():
-    global __lib__
-    if __lib__ is None:
-        path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
-        __lib__ = ctypes.cdll.LoadLibrary(path)
-    return __lib__
-
-
-class client(object):
-    """
-    client is a client to the master server.
-    """
-
-    def __init__(self, etcd_endpoints, timeout_sec, buf_size=0):
-        self.c = get_c_lib().paddle_new_etcd_master_client(
-            etcd_endpoints, timeout_sec, buf_size)
-
-    def request_save_model(self, trainer_id, block_ms):
-        """request to save model
-
-        Conventionally the 0-th trainer will save model. But in
-        distributed training, any trainer could be killed. This
-        function asks the master server if the trainer should proceed
-        with saving model.
-
-        :param trainer_id: trainer id.
-        :param block_ms: number of millisecond that other save model
-        will be blocked if this save model request succeeded.
-
-        Returns:
-            int: 1 if the save the model request is approved, 0 if
-            does the request is rejected because other trainer is
-            saving the model, -1 if error happened.
-
-        """
-        return get_c_lib().paddle_request_save_model(self.c, trainer_id,
-                                                     block_ms)
-
-    def release(self):
-        get_c_lib().paddle_release_master_client(self.c)
-        self.c = None
-
-    def set_dataset(self, paths):
-        holder_type = ctypes.c_char_p * len(paths)
-        holder = holder_type()
-        for idx, path in enumerate(paths):
-            c_ptr = ctypes.c_char_p(path)
-            holder[idx] = c_ptr
-        get_c_lib().paddle_set_dataset(self.c, holder, len(paths))
-
-    def next_record(self):
-        """gets next record for training
-
-        Returns:
-            string: the record.
-            int: error code, 0 if successful, < 0 otherwise.
-        """
-        p = ctypes.c_char_p()
-        ret = ctypes.pointer(p)
-        size = get_c_lib().paddle_next_record(self.c, ret)
-        if size < 0:
-            # Error
-            return None, size
-
-        if size == 0:
-            # Empty record
-            return "", 0
-
-        record = ret.contents.value[:size]
-        # Memory created from C should be freed.
-        get_c_lib().mem_free(ret.contents)
-        return record, 0
-
-    def paddle_start_get_records(self, pass_id):
-        get_c_lib().paddle_start_get_records(self.c, pass_id)
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
deleted file mode 100644
index 3c6a53db3..000000000
--- a/python/paddle/v2/minibatch.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['batch']
-
-
-def batch(reader, batch_size, drop_last=True):
-    """
-    Create a batched reader.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param batch_size: size of each mini-batch
-    :type batch_size: int
-    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
-    :type drop_last: bool
-    :return: the batched reader.
-    :rtype: callable
-    """
-
-    def batch_reader():
-        r = reader()
-        b = []
-        for instance in r:
-            b.append(instance)
-            if len(b) == batch_size:
-                yield b
-                b = []
-        if drop_last == False and len(b) != 0:
-            yield b
-
-    return batch_reader
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
deleted file mode 100644
index 8ae9f3b20..000000000
--- a/python/paddle/v2/networks.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.networks as conf_nw
-import inspect
-from config_base import __convert_to_v2__
-
-__all__ = []
-
-
-def __initialize__():
-    for each_subnetwork in conf_nw.__all__:
-        if each_subnetwork in ['inputs', 'outputs']:
-            continue
-        func = getattr(conf_nw, each_subnetwork)
-        globals()[each_subnetwork] = func
-        globals()[each_subnetwork].__name__ = each_subnetwork
-        global __all__
-        __all__.append(each_subnetwork)
-
-
-__initialize__()
diff --git a/python/paddle/v2/op.py b/python/paddle/v2/op.py
deleted file mode 100644
index 03f3b9b9e..000000000
--- a/python/paddle/v2/op.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import layer
-import activation as act
-from config_base import Layer
-from paddle.trainer_config_helpers.attrs import is_compatible_with
-from paddle.trainer_config_helpers.default_decorators import wrap_name_default
-
-__all__ = []
-
-
-def __register_unary_math_op__(op_name, act):
-    def op(input, name=None):
-        return layer.mixed(
-            input=[layer.identity_projection(input=input)], name=name, act=act)
-
-    op = wrap_name_default(op_name)(op)
-    op.__doc__ = type(act).__doc__
-    globals()[op_name] = op
-    __all__.append(op_name)
-
-
-__register_unary_math_op__('exp', act.Exp())
-__register_unary_math_op__('log', act.Log())
-__register_unary_math_op__('abs', act.Abs())
-__register_unary_math_op__('sigmoid', act.Sigmoid())
-__register_unary_math_op__('tanh', act.Tanh())
-__register_unary_math_op__('square', act.Square())
-__register_unary_math_op__('relu', act.Relu())
-__register_unary_math_op__('sqrt', act.Sqrt())
-__register_unary_math_op__('reciprocal', act.Reciprocal())
-__register_unary_math_op__('softmax', act.Softmax())
-
-
-def __add__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, intercept=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be added with"
-                        " another Layer or a number")
-    if layeroutput.size == other.size:
-        return layer.mixed(input=[
-            layer.identity_projection(input=layeroutput),
-            layer.identity_projection(input=other)
-        ])
-    if other.size != 1 and layeroutput.size != 1:
-        raise TypeError("Two Layer can be added only if they have equal size"
-                        " or one of their sizes is 1. sizes are %s and %s" %
-                        (layeroutput.size, other.size))
-    elif layeroutput.size == 1:
-        tmp = layeroutput
-        layeroutput = other
-        other = tmp
-    other = layer.repeat(other, layeroutput.size)
-    return layer.mixed(input=[
-        layer.identity_projection(input=layeroutput),
-        layer.identity_projection(input=other)
-    ])
-
-
-Layer.__radd__ = __add__
-Layer.__add__ = __add__
-
-
-def __neg__(layeroutput):
-    return layer.slope_intercept(input=layeroutput, slope=-1.0)
-
-
-Layer.__neg__ = __neg__
-
-
-def __sub__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, intercept=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be subtracted with"
-                        " another Layeroutput or a number")
-    return __add__(layeroutput, -other)
-
-
-Layer.__sub__ = __sub__
-
-
-def __rsub__(layeroutput, other):
-    neg = layer.slope_intercept(input=layeroutput, slope=-1.0)
-    return __add__(neg, other)
-
-
-Layer.__rsub__ = __rsub__
-
-
-def __mul__(layeroutput, other):
-    if is_compatible_with(other, float):
-        return layer.slope_intercept(input=layeroutput, slope=other)
-    if not isinstance(other, Layer):
-        raise TypeError("Layer can only be multiplied with"
-                        " another Layer or a number")
-    elif layeroutput.size == 1:
-        return layer.scaling(input=other, weight=layeroutput)
-    elif other.size == 1:
-        return layer.scaling(input=layeroutput, weight=other)
-    else:
-        raise TypeError("At least one of the operand of '*' must be a number"
-                        " or a Layer with size=1")
-
-
-Layer.__mul__ = __mul__
-Layer.__rmul__ = __mul__
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
deleted file mode 100644
index caef5f484..000000000
--- a/python/paddle/v2/optimizer.py
+++ /dev/null
@@ -1,297 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
-import paddle.trainer_config_helpers.optimizers as v1_optimizers
-from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig
-
-__all__ = [
-    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
-    'RMSProp', 'ModelAverage', 'L2Regularization'
-]
-
-
-class Optimizer(object):
-    def __init__(self, **kwargs):
-        import py_paddle.swig_paddle as swig_api
-        if 'batch_size' in kwargs:
-            del kwargs['batch_size']  # not important for python library.
-
-        def __impl__():
-            v1_optimizers.settings(batch_size=1, **kwargs)
-
-        self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
-            __impl__)
-        self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
-            self.__opt_conf_proto__)
-
-    def enable_types(self):
-        """
-        get enable_types for each optimizer.
-        enable_types = [value, gradient, momentum, etc]
-        For each optimizer(SGD, Adam), GradientMachine should enable different
-        buffers.
-        """
-        import py_paddle.swig_paddle as swig_api
-        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
-        assert isinstance(tmp, swig_api.ParameterOptimizer)
-        return tmp.getParameterTypes()
-
-    def __create_local_updater__(self):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
-
-    def __create_remote_updater__(self, pass_num, use_sparse_updater):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createRemoteUpdater(
-            self.__opt_conf__, pass_num, use_sparse_updater)
-
-    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
-        import py_paddle.swig_paddle as swig_api
-        return swig_api.ParameterUpdater.createNewRemoteUpdater(
-            self.__opt_conf__, pserver_spec, use_etcd)
-
-    def create_updater(self, is_local, num_passes, use_sparse_updater,
-                       pserver_spec, use_etcd):
-        """
-        create proper parameter_updater by configuration.
-        :param is_local: create local or remote parameter updater
-        :param num_passes: remote parameter updater will use this to config
-        parameter server.
-        :param use_sparse_updater: when use remote updater, if some parameter is
-        sparse, updater should do some extra thing:
-
-        ..  code-block:: python
-
-            if use_sparse_remote_updater:
-                        gradient_machine.prefetch(in_args)
-                        parameter_updater.getParametersRemote()
-
-        :param pserver_spec: pserver location, eg: localhost:3000, if use etcd,
-        pserver_spec should be the etcd endpoints, eg: http://localhost:2379
-        :return: parameter_updater
-        """
-        if is_local:
-            parameter_updater = self.__create_local_updater__()
-        else:
-            if pserver_spec is None:
-                parameter_updater = self.__create_remote_updater__(
-                    num_passes, use_sparse_updater)
-            else:
-                parameter_updater = self.__create_new_remote_updater__(
-                    pserver_spec, use_etcd)
-        return parameter_updater
-
-
-class Momentum(Optimizer):
-    """
-    Momentum Optimizer.
-
-    When sparse=False, the momentum update formula is as follows:
-
-    ..  math::
-
-        v_{t} &= k * v_{t-1} - \\gamma_t (g_{t} + \\lambda w_{t-1}) \\\\
-        w_{t} &= w_{t-1} + v_{t} \\\\
-
-    where, :math:`k` is momentum, :math:`\\lambda` is decay rate,
-    :math:`\\gamma_t` is learning rate at the t'th iteration.
-    :math:`w_{t}` is the weight as the t'th iteration.
-    And the :math:`v_{t}` is the history momentum variable.
-
-    When sparse=True, the update scheme:
-
-    ..  math::
-
-        \\alpha_t &= \\alpha_{t-1} / k \\\\
-        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
-        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
-        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
-        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
-    
-    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
-    :math:`\\gamma_t` is learning rate at the t'th iteration.
-
-    :param momentum: the momentum factor.
-    :type momentum: float
-    :param sparse: with sparse support or not, False by default.
-    :type sparse: bool
-    """
-
-    def __init__(self, momentum=None, sparse=False, **kwargs):
-        learning_method = v1_optimizers.MomentumOptimizer(
-            momentum=momentum, sparse=sparse)
-        super(Momentum, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class Adam(Optimizer):
-    """
-    Adam optimizer.
-    The details of please refer `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
-        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
-
-    :param beta1: the :math:`\\beta_1` in equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in equation.
-    :type beta2: float
-    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
-                        divided by zero.
-    :type epsilon: float
-    """
-
-    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
-        learning_method = v1_optimizers.AdamOptimizer(
-            beta1=beta1, beta2=beta2, epsilon=epsilon)
-        super(Adam, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class Adamax(Optimizer):
-    """
-    Adamax optimizer.
-
-    The details of please refer this `Adam: A Method for Stochastic Optimization
-    <https://arxiv.org/abs/1412.6980>`_
-
-    ..  math::
-
-        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
-        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
-        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
-
-    :param beta1: the :math:`\\beta_1` in the equation.
-    :type beta1: float
-    :param beta2: the :math:`\\beta_2` in the equation.
-    :type beta2: float
-    """
-
-    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
-        learning_method = v1_optimizers.AdamaxOptimizer(
-            beta1=beta1, beta2=beta2)
-        super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class AdaGrad(Optimizer):
-    """
-    Adagrad(for ADAptive GRAdient algorithm) optimizer.
-
-    For details please refer this `Adaptive Subgradient Methods for
-    Online Learning and Stochastic Optimization
-    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
-
-    ..  math::
-
-        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
-        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
-    """
-
-    def __init__(self, **kwargs):
-        learning_method = v1_optimizers.AdaGradOptimizer()
-        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
-
-
-class DecayedAdaGrad(Optimizer):
-    """
-    AdaGrad method with decayed sum gradients. The equations of this method
-    show as follow.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= 1/sqrt( ( E(g_t^2) + \\epsilon )
-
-    :param rho: The :math:`\\rho` parameter in that equation
-    :type rho: float
-    :param epsilon: The :math:`\\epsilon` parameter in that equation.
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
-        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(DecayedAdaGrad, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class AdaDelta(Optimizer):
-    """
-    AdaDelta method. The details of adadelta please refer to this
-    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
-    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
-
-    ..  math::
-
-        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
-        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
-                          E(g_t^2) + \\epsilon ) ) \\\\
-        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
-
-    :param rho: :math:`\\rho` in equation
-    :type rho: float
-    :param epsilon: :math:`\\rho` in equation
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
-        learning_method = v1_optimizers.AdaDeltaOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(AdaDelta, self).__init__(
-            learning_method=learning_method, **kwargs)
-
-
-class RMSProp(Optimizer):
-    """
-    RMSProp(for Root Mean Square Propagation) optimizer. For details please
-    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
-    lecture_slides_lec6.pdf>`_.
-
-    The equations of this method as follows:
-
-    ..  math::
-
-        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
-        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
-
-    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
-    :type rho: float
-    :param epsilon: the :math:`\\epsilon` in the equation.
-    :type epsilon: float
-    """
-
-    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
-        learning_method = v1_optimizers.RMSPropOptimizer(
-            rho=rho, epsilon=epsilon)
-        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
-
-
-ModelAverage = v1_optimizers.ModelAverage
-L2Regularization = v1_optimizers.L2Regularization
-
-if __name__ == '__main__':
-    import py_paddle.swig_paddle as swig_api
-    swig_api.initPaddle('--use_gpu=false')
-    for opt in [
-            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
-            AdaDelta(), RMSProp(), Adam(
-                model_average=ModelAverage(average_window=0.5),
-                regularization=L2Regularization(rate=0.5),
-                gradient_clipping_threshold=25)
-    ]:
-        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
deleted file mode 100644
index 7b7d1a1d1..000000000
--- a/python/paddle/v2/parameters.py
+++ /dev/null
@@ -1,441 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-from collections import OrderedDict
-import paddle.trainer.config_parser as cp
-import struct
-import tarfile
-import cStringIO
-from topology import Topology
-
-__all__ = ['Parameters', 'create']
-
-
-def create(layers):
-    """
-    Create parameter pool by topology.
-
-    :param layers:
-    :return:
-    """
-    topology = Topology(layers)
-    pool = Parameters()
-    initializers = cp.g_parameter_initializer_map
-    for param in topology.proto().parameters:
-        pool.__append_config__(param)
-        if param.name in initializers:
-            pool[param.name] = initializers[param.name](param.name)
-    return pool
-
-
-class Parameters(object):
-    """
-    `Parameters` manages all the learnable parameters in a neural network.
-    It stores parameters' information in an OrderedDict. The key is
-    the name of a parameter, and value is a parameter's configuration(in
-    protobuf format), such as initialization mean and std, its size, whether it
-    is a static parameter, and so on.
-
-    :param __param_conf__: store the configurations of learnable parameters in
-        the network in an OrderedDict. Parameter is added one by one into the
-        dict by following their created order in the network: parameters of
-        the previous layers in a network are careted first. You can visit the
-        parameters from bottom to top by iterating over this dict.
-    :type __param_conf__: OrderedDict
-    :param __gradient_machines__: all of the parameters in a neural network are
-        appended to a PaddlePaddle gradient machine, which is used internally to
-        copy parameter values between C++ and Python end.
-    :type __gradient_machines__: list
-    :param __tmp_params__: a dict to store dummy parameters if no
-        __gradient_machines__ is appended to `Parameters`.
-    :type __tmp_params__: dict
-
-    Basically usage is
-
-    ..  code-block:: python
-
-        data = paddle.layers.data(...)
-        ...
-        out = paddle.layers.fc(...)
-
-        parameters = paddle.parameters.create(out)
-
-        parameter_names = parameters.names()
-        fc_mat = parameters.get('fc')
-        print fc_mat
-    """
-
-    def __init__(self):
-        self.__param_conf__ = OrderedDict()
-        self.__gradient_machines__ = []
-        self.__tmp_params__ = dict()
-
-    def __append_config__(self, param_conf):
-        """
-        Append a parameter configuration. It used to initialize Parameters and
-        should be invoked only in paddle.parameters.create
-
-        :param param_conf: The parameter configuration in protobuf
-        :type param_conf: ParameterConfig
-        :return: Nothing
-        """
-
-        if not isinstance(param_conf, ParameterConfig):
-            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
-
-        if param_conf.name in self.__param_conf__:
-            raise ValueError("duplicated parameter %s" % param_conf.name)
-
-        self.__param_conf__[param_conf.name] = param_conf
-
-    def update_param_conf(self, model_config):
-        for p in model_config.parameters:
-            self.__param_conf__[p.name] = p
-
-    def keys(self):
-        """
-        keys are the names of each parameter.
-
-        :return: list of parameter name
-        :rtype: list
-        """
-        return self.__param_conf__.keys()
-
-    def names(self):
-        """
-        names of each parameter.
-
-        :return: list of parameter name
-        :rtype: list
-        """
-        return self.keys()
-
-    def has_key(self, key):
-        """
-        has_key return true if there are such parameter name == key
-
-        :param key: Parameter name
-        :type key: basestring
-        :return: True if contains such key
-        """
-        return key in self.__param_conf__.keys()
-
-    def __iter__(self):
-        """
-        Return an iterator of parameter name. It is used by `for loop`
-        or `in` operator.
-
-        ..  code-block:: python
-
-            parameters = paddle.parameters.create(...)
-            if "fc_param" in parameters:
-                print 'OK'
-        :return: an iterator of parameter name
-        :rtype: iterator
-        """
-        return iter(self.__param_conf__)
-
-    def __getter_inner(self, key, param_type):
-        import py_paddle.swig_paddle as api
-        shape = self.get_shape(key)
-
-        if len(self.__gradient_machines__) == 0:
-            # create new parameter in python numpy.
-            if key in self.__tmp_params__:
-                return self.__tmp_params__[key]
-            else:
-                return np.ndarray(shape=shape, dtype=np.float32)
-        else:
-            for each_gradient_machine in self.__gradient_machines__:
-                param = __get_parameter_in_gradient_machine__(
-                    each_gradient_machine, key)
-                # for simplify implementation now, we always copy from C++
-                assert isinstance(param, api.Parameter)
-                val = param.getBuf(param_type)
-                assert isinstance(val, api.Vector)
-                val = val.copyToNumpyArray()
-                return val
-                # else continue
-
-            raise RuntimeError("Unexpected branch")
-
-    def __getitem__(self, key):
-        """
-        Get parameter by parameter name. It uses Python dict syntax.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :return: parameter value
-        :rtype: np.ndarray
-        """
-        import py_paddle.swig_paddle as api
-        return self.__getter_inner(key, api.PARAMETER_VALUE)
-
-    def get_shape(self, key):
-        """
-        get shape of the parameter.
-
-        :param key: parameter name
-        :type key: basestring
-        :return: parameter's shape
-        :rtype: tuple
-        """
-        if not isinstance(key, basestring):
-            raise ValueError("parameter name should be string")
-        if not self.has_key(key):
-            raise ValueError("No such parameter %s" % key)
-        conf = self.__param_conf__[key]
-        dims = conf.dims if conf.dims else (1, conf.size)
-        return tuple(map(int, dims))
-
-    def __setitem__(self, key, value):
-        """
-        Set parameter by parameter name & value. It use Python dict syntax.
-
-        :note: It will always copy the parameter to C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :param value: Parameter matrix.
-        :type value: np.ndarray
-        :return: Nothing
-        """
-
-        if not isinstance(value, np.ndarray):
-            raise ValueError("Must return ndarray")
-        value = value.astype(dtype=np.float32)
-        shape = self.get_shape(key)
-        if value.shape != shape:
-            raise ValueError("Value shape mismatch, expect %s, should %s" %
-                             (shape, value.shape))
-
-        if len(self.__gradient_machines__) == 0:
-            self.__tmp_params__[key] = value
-        else:
-            for each_gradient_machine in self.__gradient_machines__:
-                __copy_parameter_to_gradient_machine__(each_gradient_machine,
-                                                       key, value)
-
-    def get(self, parameter_name):
-        """
-        Get parameter by parameter name.
-
-        :note: It will always copy the parameter from C++ side.
-        :param parameter_name: parameter name
-        :type parameter_name: basestring
-        :return: The parameter matrix.
-        :rtype: np.ndarray
-        """
-        return self.__getitem__(key=parameter_name)
-
-    def get_grad(self, key):
-        """
-        Get grandient by parameter name.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: parameter name
-        :type key: basestring
-        :return: The grandient matrix.
-        :rtype: np.ndarray
-        """
-        import py_paddle.swig_paddle as api
-        if self.__param_conf__[key].is_static:
-            return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
-
-        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
-
-    def set(self, parameter_name, value):
-        """
-        Set parameter by parameter name & matrix.
-
-        :param parameter_name: parameter name
-        :type parameter_name: basestring
-        :param value: parameter matrix
-        :type value: np.ndarray
-        :return: Nothing.
-        """
-        self.__setitem__(key=parameter_name, value=value)
-
-    def append_gradient_machine(self, gradient_machine):
-        """
-        append gradient machine to parameters. This method is used internally in
-        Trainer.train.
-
-        :param gradient_machine: PaddlePaddle C++ GradientMachine object.
-        :type gradient_machine: api.GradientMachine
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        if not isinstance(gradient_machine, api.GradientMachine):
-            raise ValueError("gradient_machine should be api.GradientMachine")
-
-        if len(self.__tmp_params__) != 0:
-            for name, val in self.__tmp_params__.iteritems():
-                try:
-                    __copy_parameter_to_gradient_machine__(gradient_machine,
-                                                           name, val)
-                except ValueError:
-                    # If no such parameter in gradient machine, then don't copy
-                    pass
-
-        self.__gradient_machines__.append(gradient_machine)
-
-    def serialize(self, name, f):
-        """
-
-        :param name:
-        :param f:
-        :type f: file
-        :return:
-        """
-        param = self.get(name)
-        size = reduce(lambda a, b: a * b, param.shape)
-        f.write(struct.pack("IIQ", 0, 4, size))
-        param = param.astype(np.float32)
-        s = param.tostring()
-        wrote_size = 0
-        buf = buffer(s, wrote_size, 65535)
-        while buf:  # f.write crashes with big data blog.
-            f.write(buf)
-            wrote_size += 65535
-            buf = buffer(s, wrote_size, 65535)
-
-    def deserialize(self, name, f):
-        """
-
-        :param name:
-        :param f:
-        :type f: file
-        :return:
-        """
-        f.read(16)  # header
-        arr = np.frombuffer(f.read(), dtype=np.float32)
-        self.set(name, arr.reshape(self.get_shape(name)))
-
-    def to_tar(self, f):
-        """
-        Save parameters to a tar file.
-
-        WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)`
-            to save parameters most of the time. Otherwise, some settings such
-            as model average will not take effect.
-
-        :param f:
-        :type f: file
-        :return:
-        """
-        tar = tarfile.TarFile(fileobj=f, mode='w')
-        for nm in self.names():
-            buf = cStringIO.StringIO()
-            self.serialize(nm, buf)
-            tarinfo = tarfile.TarInfo(name=nm)
-            buf.seek(0)
-            tarinfo.size = len(buf.getvalue())
-            tar.addfile(tarinfo, buf)
-
-            conf = self.__param_conf__[nm]
-            confStr = conf.SerializeToString()
-            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
-            tarinfo.size = len(confStr)
-            buf = cStringIO.StringIO(confStr)
-            buf.seek(0)
-            tar.addfile(tarinfo, fileobj=buf)
-
-    @staticmethod
-    def from_tar(f):
-        """
-        Create a `Parameters` object from the given file. And
-        the `Parameters` only contains the parameters in this
-        file. It is adapted the parameters are same in the
-        defined network and the given file. For example, it
-        can be used in the inference.
-
-        :param f: the initialized model file.
-        :type f: tar file
-        :return: A Parameters object.
-        :rtype: Parameters.
-        """
-        params = Parameters()
-        tar = tarfile.TarFile(fileobj=f, mode='r')
-        for finfo in tar:
-            assert isinstance(finfo, tarfile.TarInfo)
-            if finfo.name.endswith('.protobuf'):
-                f = tar.extractfile(finfo)
-                conf = ParameterConfig()
-                conf.ParseFromString(f.read())
-                params.__append_config__(conf)
-
-        for param_name in params.names():
-            f = tar.extractfile(param_name)
-            params.deserialize(param_name, f)
-        return params
-
-    def init_from_tar(self, f, exclude_params=[]):
-        """
-        Different from `from_tar`, this interface can be used to
-        init partial network parameters from another saved model.
-
-        :param f: the initialized model file.
-        :type f: tar file
-        :param exclude_params: the names of parameters that should  
-            not be initialized from the model file.
-        :type exclude_params: list of strings
-        :return: Nothing.
-        """
-
-        tar_param = Parameters.from_tar(f)
-        for pname in tar_param.names():
-            if pname in self.names() and pname not in exclude_params:
-                self.set(pname, tar_param.get(pname))
-
-
-def __get_parameter_in_gradient_machine__(gradient_machine, name):
-    """
-
-    :param gradient_machine:
-    :type gradient_machine: api.GradientMachine
-    :param name:
-    :return:
-    :rtype: api.Parameter
-    """
-    params = filter(lambda p: p.getName() == name,
-                    gradient_machine.getParameters())
-
-    if len(params) == 0:
-        raise ValueError("No such parameter")
-    elif len(params) > 1:
-        raise ValueError("Unexpected branch")
-    else:
-        return params[0]
-
-
-def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
-    """
-    Copy a python ndarray into the gradient machine.
-
-    :param gradient_machine:
-    :type gradient_machine: api.GradientMachine
-    :param name:
-    :param arr:
-    :type arr: np.ndarray
-    :return:
-    :rtype: api.Parameter
-    """
-    import py_paddle.swig_paddle as api
-    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
-    vec = param.getBuf(api.PARAMETER_VALUE)
-    assert isinstance(vec, api.Vector)
-    vec.copyFromNumpyArray(arr.flatten())
diff --git a/python/paddle/v2/plot/__init__.py b/python/paddle/v2/plot/__init__.py
deleted file mode 100644
index acd3013db..000000000
--- a/python/paddle/v2/plot/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from plot import Ploter
-
-__all__ = ['Ploter']
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
deleted file mode 100644
index c18e63dd5..000000000
--- a/python/paddle/v2/plot/plot.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-class PlotData(object):
-    def __init__(self):
-        self.step = []
-        self.value = []
-
-    def append(self, step, value):
-        self.step.append(step)
-        self.value.append(value)
-
-    def reset(self):
-        self.step = []
-        self.value = []
-
-
-class Ploter(object):
-    def __init__(self, *args):
-        self.__args__ = args
-        self.__plot_data__ = {}
-        for title in args:
-            self.__plot_data__[title] = PlotData()
-        # demo in notebooks will use Ploter to plot figure, but when we convert
-        # the ipydb to py file for testing, the import of matplotlib will make the
-        # script crash. So we can use `export DISABLE_PLOT=True` to disable import
-        # these libs
-        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
-        if not self.__plot_is_disabled__():
-            import matplotlib.pyplot as plt
-            from IPython import display
-            self.plt = plt
-            self.display = display
-
-    def __plot_is_disabled__(self):
-        return self.__disable_plot__ == "True"
-
-    def append(self, title, step, value):
-        assert isinstance(title, basestring)
-        assert self.__plot_data__.has_key(title)
-        data = self.__plot_data__[title]
-        assert isinstance(data, PlotData)
-        data.append(step, value)
-
-    def plot(self, path=None):
-        if self.__plot_is_disabled__():
-            return
-
-        titles = []
-        for title in self.__args__:
-            data = self.__plot_data__[title]
-            assert isinstance(data, PlotData)
-            if len(data.step) > 0:
-                titles.append(title)
-                self.plt.plot(data.step, data.value)
-        self.plt.legend(titles, loc='upper left')
-        if path is None:
-            self.display.clear_output(wait=True)
-            self.display.display(self.plt.gcf())
-        else:
-            self.plt.savefig(path)
-        self.plt.gcf().clear()
-
-    def reset(self):
-        for key in self.__plot_data__:
-            data = self.__plot_data__[key]
-            assert isinstance(data, PlotData)
-            data.reset()
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
deleted file mode 100644
index 4b6c1c809..000000000
--- a/python/paddle/v2/plot/tests/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-if (NOT APPLE)
-  # The Mac OS X backend will not be able to function correctly if Python is
-  # not installed as a framework.
-  py_test(test_ploter SRCS test_ploter.py)
-endif()
diff --git a/python/paddle/v2/plot/tests/__init__.py b/python/paddle/v2/plot/tests/__init__.py
deleted file mode 100644
index d1abfc08f..000000000
--- a/python/paddle/v2/plot/tests/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import test_ploter
-
-__all__ = ['test_ploter.py']
diff --git a/python/paddle/v2/plot/tests/test_ploter.py b/python/paddle/v2/plot/tests/test_ploter.py
deleted file mode 100644
index a75f853ed..000000000
--- a/python/paddle/v2/plot/tests/test_ploter.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from paddle.v2.plot import Ploter
-
-
-class TestCommon(unittest.TestCase):
-    def test_append(self):
-        title1 = "title1"
-        title2 = "title2"
-        plot_test = Ploter(title1, title2)
-        plot_test.append(title1, 1, 2)
-        plot_test.append(title1, 2, 5)
-        plot_test.append(title2, 3, 4)
-        self.assertEqual(plot_test.__plot_data__[title1].step, [1, 2])
-        self.assertEqual(plot_test.__plot_data__[title1].value, [2, 5])
-        self.assertEqual(plot_test.__plot_data__[title2].step, [3])
-        self.assertEqual(plot_test.__plot_data__[title2].value, [4])
-        plot_test.reset()
-        self.assertEqual(plot_test.__plot_data__[title1].step, [])
-        self.assertEqual(plot_test.__plot_data__[title1].value, [])
-        self.assertEqual(plot_test.__plot_data__[title2].step, [])
-        self.assertEqual(plot_test.__plot_data__[title2].value, [])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
deleted file mode 100644
index 4881c27d1..000000000
--- a/python/paddle/v2/pooling.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.trainer_config_helpers.poolings
-import copy
-
-__all__ = []
-suffix = 'Pooling'
-
-for name in paddle.trainer_config_helpers.poolings.__all__:
-    new_name = name[:-len(suffix)]
-    globals()[new_name] = copy.copy(
-        getattr(paddle.trainer_config_helpers.poolings, name))
-    globals()[new_name].__name__ = new_name
-    __all__.append(new_name)
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
deleted file mode 100644
index 12efdc4a0..000000000
--- a/python/paddle/v2/reader/__init__.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-At training and testing time, PaddlePaddle programs need to read data. To ease
-the users' work to write data reading code, we define that
-
-- A *reader* is a function that reads data (from file, network, random number
-  generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and
-  returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network,
-  random number generator, etc) and yields a batch of data items.
-
-#####################
-Data Reader Interface
-#####################
-
-Indeed, *data reader* doesn't have to be a function that reads and yields data
-items. It can be any function with no parameter that creates a iterable
-(anything can be used in :code:`for x in iterable`)\:
-
-..  code-block:: python
-
-    iterable = data_reader()
-
-Element produced from the iterable should be a **single** entry of data,
-**not** a mini batch. That entry of data could be a single item, or a tuple of
-items.
-Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
-/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
-array of float32, int, list of int)
-
-An example implementation for single item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image(width, height):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height)
-        return reader
-
-An example implementation for multiple item data reader creator:
-
-..  code-block:: python
-
-    def reader_creator_random_image_and_label(width, height, label):
-        def reader():
-            while True:
-                yield numpy.random.uniform(-1, 1, size=width*height), label
-        return reader
-
-
-TODO(yuyang18): Should we add whole design doc here?
-"""
-
-import decorator
-from decorator import *
-
-import creator
-
-__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
deleted file mode 100644
index fda5246d7..000000000
--- a/python/paddle/v2/reader/creator.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Creator package contains some simple reader creator, which could
-be used in user program.
-"""
-
-__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
-
-
-def np_array(x):
-    """
-    Creates a reader that yields elements of x, if it is a
-    numpy vector. Or rows of x, if it is a numpy matrix.
-    Or any sub-hyperplane indexed by the highest dimension.
-
-    :param x: the numpy array to create reader from.
-    :returns: data reader created from x.
-    """
-
-    def reader():
-        if x.ndim < 1:
-            yield x
-
-        for e in x:
-            yield e
-
-    return reader
-
-
-def text_file(path):
-    """
-    Creates a data reader that outputs text line by line from given text file.
-    Trailing new line ('\\\\n') of each line will be removed.
-
-    :path: path of the text file.
-    :returns: data reader of text file
-    """
-
-    def reader():
-        f = open(path, "r")
-        for l in f:
-            yield l.rstrip('\n')
-        f.close()
-
-    return reader
-
-
-def recordio(paths, buf_size=100):
-    """
-    Creates a data reader from given RecordIO file paths separated by ",",
-        glob pattern is supported.
-    :path: path of recordio files, can be a string or a string list.
-    :returns: data reader of recordio files.
-    """
-
-    import recordio as rec
-    import paddle.v2.reader.decorator as dec
-    import cPickle as pickle
-
-    def reader():
-        if isinstance(paths, basestring):
-            path = paths
-        else:
-            path = ",".join(paths)
-        f = rec.reader(path)
-        while True:
-            r = f.read()
-            if r is None:
-                break
-            yield pickle.loads(r)
-        f.close()
-
-    return dec.buffered(reader, buf_size)
-
-
-pass_num = 0
-
-
-def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
-    """
-    Create a data reader that yield a record one by one from
-        the paths:
-    :paths: path of recordio files, can be a string or a string list.
-    :etcd_endpoints: the endpoints for etcd cluster
-    :returns: data reader of recordio files.
-
-    ..  code-block:: python
-        from paddle.v2.reader.creator import cloud_reader
-        etcd_endpoints = "http://127.0.0.1:2379"
-        trainer.train.(
-            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
-        )
-    """
-    import os
-    import cPickle as pickle
-    import paddle.v2.master as master
-    c = master.client(etcd_endpoints, timeout_sec, buf_size)
-
-    if isinstance(paths, basestring):
-        path = [paths]
-    else:
-        path = paths
-    c.set_dataset(path)
-
-    def reader():
-        global pass_num
-        c.paddle_start_get_records(pass_num)
-        pass_num += 1
-
-        while True:
-            r, e = c.next_record()
-            if not r:
-                if e != -2:
-                    print "get record error: ", e
-                break
-            yield pickle.loads(r)
-
-    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
deleted file mode 100644
index 44a6e3446..000000000
--- a/python/paddle/v2/reader/decorator.py
+++ /dev/null
@@ -1,405 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = [
-    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
-]
-
-from threading import Thread
-import subprocess
-
-from Queue import Queue
-import itertools
-import random
-import zlib
-
-
-def map_readers(func, *readers):
-    """
-    Creates a data reader that outputs return value of function using
-    output of each data readers as arguments.
-
-    :param func: function to use. The type of func should be (Sample) => Sample
-    :type: callable
-    :param readers: readers whose outputs will be used as arguments of func.
-    :return: the created data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        for e in itertools.imap(func, *rs):
-            yield e
-
-    return reader
-
-
-def shuffle(reader, buf_size):
-    """
-    Creates a data reader whose data output is shuffled.
-
-    Output from the iterator that created by original reader will be
-    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
-    is determined by argument buf_size.
-
-    :param reader: the original reader whose output will be shuffled.
-    :type reader: callable
-    :param buf_size: shuffle buffer size.
-    :type buf_size: int
-
-    :return: the new reader whose output is shuffled.
-    :rtype: callable
-    """
-
-    def data_reader():
-        buf = []
-        for e in reader():
-            buf.append(e)
-            if len(buf) >= buf_size:
-                random.shuffle(buf)
-                for b in buf:
-                    yield b
-                buf = []
-
-        if len(buf) > 0:
-            random.shuffle(buf)
-            for b in buf:
-                yield b
-
-    return data_reader
-
-
-def chain(*readers):
-    """
-    Creates a data reader whose output is the outputs of input data
-    readers chained together.
-
-    If input readers output following data entries:
-    [0, 0, 0]
-    [1, 1, 1]
-    [2, 2, 2]
-    The chained reader will output:
-    [0, 0, 0, 1, 1, 1, 2, 2, 2]
-
-    :param readers: input readers.
-    :return: the new data reader.
-    :rtype: callable
-    """
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-
-        for e in itertools.chain(*rs):
-            yield e
-
-    return reader
-
-
-class ComposeNotAligned(ValueError):
-    pass
-
-
-def compose(*readers, **kwargs):
-    """
-    Creates a data reader whose output is the combination of input readers.
-
-    If input readers output following data entries:
-    (1, 2)    3    (4, 5)
-    The composed reader will output:
-    (1, 2, 3, 4, 5)
-
-    :param readers: readers that will be composed together.
-    :param check_alignment: if True, will check if input readers are aligned
-        correctly. If False, will not check alignment and trailing outputs
-        will be discarded. Defaults to True.
-    :type check_alignment: bool
-
-    :return: the new data reader.
-
-    :raises ComposeNotAligned: outputs of readers are not aligned.
-        Will not raise when check_alignment is set to False.
-    """
-    check_alignment = kwargs.pop('check_alignment', True)
-
-    def make_tuple(x):
-        if isinstance(x, tuple):
-            return x
-        else:
-            return (x, )
-
-    def reader():
-        rs = []
-        for r in readers:
-            rs.append(r())
-        if not check_alignment:
-            for outputs in itertools.izip(*rs):
-                yield sum(map(make_tuple, outputs), ())
-        else:
-            for outputs in itertools.izip_longest(*rs):
-                for o in outputs:
-                    if o is None:
-                        # None will be not be present if compose is aligned
-                        raise ComposeNotAligned(
-                            "outputs of readers are not aligned.")
-                yield sum(map(make_tuple, outputs), ())
-
-    return reader
-
-
-def buffered(reader, size):
-    """
-    Creates a buffered data reader.
-
-    The buffered data reader will read and save data entries into a
-    buffer. Reading from the buffered data reader will proceed as long
-    as the buffer is not empty.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param size: max buffer size.
-    :type size: int
-
-    :returns: the buffered data reader.
-    """
-
-    class EndSignal():
-        pass
-
-    end = EndSignal()
-
-    def read_worker(r, q):
-        for d in r:
-            q.put(d)
-        q.put(end)
-
-    def data_reader():
-        r = reader()
-        q = Queue(maxsize=size)
-        t = Thread(
-            target=read_worker, args=(
-                r,
-                q, ))
-        t.daemon = True
-        t.start()
-        e = q.get()
-        while e != end:
-            yield e
-            e = q.get()
-
-    return data_reader
-
-
-def firstn(reader, n):
-    """
-    Limit the max number of samples that reader could return.
-
-    :param reader: the data reader to read from.
-    :type reader: callable
-    :param n: the max number of samples that return.
-    :type n: int
-    :return: the decorated reader.
-    :rtype: callable
-    """
-
-    # TODO(yuyang18): Check if just drop the reader, could clean the opened
-    # resource or not?
-
-    def firstn_reader():
-        for i, item in enumerate(reader()):
-            if i == n:
-                break
-            yield item
-
-    return firstn_reader
-
-
-class XmapEndSignal():
-    pass
-
-
-def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
-    """
-    Use multiprocess to map samples from reader by a mapper defined by user.
-    And this function contains a buffered decorator.
-    :param mapper:  a function to map sample.
-    :type mapper: callable
-    :param reader: the data reader to read from
-    :type reader: callable
-    :param process_num: process number to handle original sample
-    :type process_num: int
-    :param buffer_size: max buffer size
-    :type buffer_size: int
-    :param order: keep the order of reader
-    :type order: bool
-    :return: the decarated reader
-    :rtype: callable
-    """
-    end = XmapEndSignal()
-
-    # define a worker to read samples from reader to in_queue
-    def read_worker(reader, in_queue):
-        for i in reader():
-            in_queue.put(i)
-        in_queue.put(end)
-
-    # define a worker to read samples from reader to in_queue with order flag
-    def order_read_worker(reader, in_queue):
-        in_order = 0
-        for i in reader():
-            in_queue.put((in_order, i))
-            in_order += 1
-        in_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue
-    def handle_worker(in_queue, out_queue, mapper):
-        sample = in_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            r = mapper(sample)
-            out_queue.put(r)
-            sample = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    # define a worker to handle samples from in_queue by mapper
-    # and put mapped samples into out_queue by order
-    def order_handle_worker(in_queue, out_queue, mapper, out_order):
-        ins = in_queue.get()
-        while not isinstance(ins, XmapEndSignal):
-            order, sample = ins
-            r = mapper(sample)
-            while order != out_order[0]:
-                pass
-            out_queue.put(r)
-            out_order[0] += 1
-            ins = in_queue.get()
-        in_queue.put(end)
-        out_queue.put(end)
-
-    def xreader():
-        in_queue = Queue(buffer_size)
-        out_queue = Queue(buffer_size)
-        out_order = [0]
-        # start a read worker in a thread
-        target = order_read_worker if order else read_worker
-        t = Thread(target=target, args=(reader, in_queue))
-        t.daemon = True
-        t.start()
-        # start several handle_workers
-        target = order_handle_worker if order else handle_worker
-        args = (in_queue, out_queue, mapper, out_order) if order else (
-            in_queue, out_queue, mapper)
-        workers = []
-        for i in xrange(process_num):
-            worker = Thread(target=target, args=args)
-            worker.daemon = True
-            workers.append(worker)
-        for w in workers:
-            w.start()
-
-        sample = out_queue.get()
-        while not isinstance(sample, XmapEndSignal):
-            yield sample
-            sample = out_queue.get()
-        finish = 1
-        while finish < process_num:
-            sample = out_queue.get()
-            if isinstance(sample, XmapEndSignal):
-                finish += 1
-            else:
-                yield sample
-
-    return xreader
-
-
-def _buf2lines(buf, line_break="\n"):
-    # FIXME: line_break should be automatically configured.
-    lines = buf.split(line_break)
-    return lines[:-1], lines[-1]
-
-
-class PipeReader:
-    """
-        PipeReader read data by stream from a command, take it's 
-        stdout into a pipe buffer and redirect it to the parser to
-        parse, then yield data as your desired format.
-
-        You can using standard linux command or call another program
-        to read data, from HDFS, Ceph, URL, AWS S3 etc:
-
-        .. code-block:: python
-           cmd = "hadoop fs -cat /path/to/some/file"
-           cmd = "cat sample_file.tar.gz"
-           cmd = "curl http://someurl"
-           cmd = "python print_s3_bucket.py"
-
-        An example:
-
-        .. code-block:: python
-    
-           def example_reader():
-               for f in myfiles:
-                   pr = PipeReader("cat %s"%f)
-                   for l in pr.get_line():
-                       sample = l.split(" ")
-                       yield sample
-    """
-
-    def __init__(self, command, bufsize=8192, file_type="plain"):
-        if not isinstance(command, str):
-            raise TypeError("left_cmd must be a string")
-        if file_type == "gzip":
-            self.dec = zlib.decompressobj(
-                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
-        self.file_type = file_type
-        self.bufsize = bufsize
-        self.process = subprocess.Popen(
-            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
-
-    def get_line(self, cut_lines=True, line_break="\n"):
-        """
-        :param cut_lines: cut buffer to lines
-        :type cut_lines: bool
-        :param line_break: line break of the file, like \n or \r
-        :type line_break: string
-
-        :return: one line or a buffer of bytes
-        :rtype: string
-        """
-        remained = ""
-        while True:
-            buff = self.process.stdout.read(self.bufsize)
-            if buff:
-                if self.file_type == "gzip":
-                    decomp_buff = self.dec.decompress(buff)
-                elif self.file_type == "plain":
-                    decomp_buff = buff
-                else:
-                    raise TypeError("file_type %s is not allowed" %
-                                    self.file_type)
-
-                if cut_lines:
-                    lines, remained = _buf2lines(''.join(
-                        [remained, decomp_buff]), line_break)
-                    for line in lines:
-                        yield line
-                else:
-                    yield decomp_buff
-            else:
-                break
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
deleted file mode 100644
index 107d5912e..000000000
--- a/python/paddle/v2/reader/tests/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-py_test(creator_test SRCS creator_test.py)
-py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
deleted file mode 100644
index eca2dce11..000000000
--- a/python/paddle/v2/reader/tests/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
deleted file mode 100644
index 7fe374e66..000000000
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ /dev/null
@@ -1,74 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright PaddlePaddle contributors. All Rights Reservedd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import unittest
-import numpy as np
-import paddle.v2.reader.creator
-
-
-class TestNumpyArray(unittest.TestCase):
-    def test_numpy_array(self):
-        l = [[1, 2, 3], [4, 5, 6]]
-        x = np.array(l, np.int32)
-        reader = paddle.v2.reader.creator.np_array(x)
-        for idx, e in enumerate(reader()):
-            self.assertItemsEqual(e, l[idx])
-
-
-class TestTextFile(unittest.TestCase):
-    def test_text_file(self):
-        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
-        reader = paddle.v2.reader.creator.text_file(path)
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
-
-
-class TestRecordIO(unittest.TestCase):
-    def do_test(self, path):
-        reader = paddle.v2.reader.creator.recordio(path)
-        idx = 0
-        for e in reader():
-            if idx == 0:
-                self.assertEqual(e, (1, 2, 3))
-            elif idx == 1:
-                self.assertEqual(e, (4, 5, 6))
-            idx += 1
-        self.assertEqual(idx, 2)
-
-    def test_recordIO(self):
-        self.do_test(
-            os.path.join(
-                os.path.dirname(__file__), "test_reader_recordio.dat"))
-        self.do_test([
-            os.path.join(
-                os.path.dirname(__file__), "test_reader_recordio.dat")
-        ])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
deleted file mode 100644
index 6b680e39f..000000000
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ /dev/null
@@ -1,178 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import time
-import unittest
-
-import paddle.v2.reader
-
-
-def reader_creator_10(dur):
-    def reader():
-        for i in range(10):
-            # this invocation helps testing paddle.reader.buffer
-            time.sleep(dur)
-            yield i
-
-    return reader
-
-
-class TestMap(unittest.TestCase):
-    def test_map(self):
-        d = {"h": 0, "i": 1}
-
-        def tokenize(x):
-            return d[x]
-
-        def read():
-            yield "h"
-            yield "i"
-
-        r = paddle.v2.reader.map_readers(tokenize, read)
-        for i, e in enumerate(r()):
-            self.assertEqual(e, i)
-
-
-class TestBuffered(unittest.TestCase):
-    def test_read(self):
-        for size in range(20):
-            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
-            c = 0
-            for i in b():
-                self.assertEqual(i, c)
-                c += 1
-            self.assertEqual(c, 10)
-
-    def test_buffering(self):
-        # read have 30ms delay.
-        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
-        last_time = time.time()
-        for idx, i in enumerate(b()):
-            elapsed_time = time.time() - last_time
-            if i == 0:
-                time.sleep(0.3)
-            else:
-                # read time should be short, meaning already buffered.
-                self.assertLess(elapsed_time, 0.05)
-            last_time = time.time()
-
-
-class TestCompose(unittest.TestCase):
-    def test_compse(self):
-        reader = paddle.v2.reader.compose(
-            reader_creator_10(0), reader_creator_10(0))
-        for idx, e in enumerate(reader()):
-            self.assertEqual(e, (idx, idx))
-
-    def test_compose_not_aligned(self):
-        total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0))
-        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
-            for e in reader():
-                total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-    def test_compose_not_aligned_no_check(self):
-        total = 0
-        reader = paddle.v2.reader.compose(
-            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
-            reader_creator_10(0),
-            check_alignment=False)
-        for e in reader():
-            total += 1
-        # expecting 10, not 20
-        self.assertEqual(total, 10)
-
-
-class TestChain(unittest.TestCase):
-    def test_chain(self):
-        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
-        idx = 0
-        for e in c():
-            self.assertEqual(e, idx % 10)
-            idx += 1
-        self.assertEqual(idx, 20)
-
-
-class TestShuffle(unittest.TestCase):
-    def test_shuffle(self):
-        case = [(0, True), (1, True), (10, False), (100, False)]
-        a = reader_creator_10(0)
-        for size, checkEq in case:
-            s = paddle.v2.reader.shuffle(a, size)
-            total = 0
-            for idx, e in enumerate(s()):
-                if checkEq:
-                    self.assertEqual(idx, e)
-                total += 1
-            self.assertEqual(total, 10)
-
-
-class TestXmap(unittest.TestCase):
-    def test_xmap(self):
-        def mapper(x):
-            return (x + 1)
-
-        orders = (True, False)
-        thread_nums = (1, 2, 4, 8, 16)
-        buffered_size = (1, 2, 4, 8, 16)
-        for order in orders:
-            for tNum in thread_nums:
-                for size in buffered_size:
-                    reader = paddle.v2.reader.xmap_readers(mapper,
-                                                           reader_creator_10(0),
-                                                           tNum, size, order)
-                    for n in xrange(3):
-                        result = []
-                        for i in reader():
-                            result.append(i)
-                        if not order:
-                            result.sort()
-                        for idx, e in enumerate(result):
-                            self.assertEqual(e, mapper(idx))
-
-
-class TestPipeReader(unittest.TestCase):
-    def test_pipe_reader(self):
-        def example_reader(myfiles):
-            for f in myfiles:
-                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
-                for l in pr.get_line():
-                    yield l
-
-        import tempfile
-
-        records = [str(i) for i in xrange(5)]
-        temp = tempfile.NamedTemporaryFile()
-        try:
-            with open(temp.name, 'w') as f:
-                for r in records:
-                    f.write('%s\n' % r)
-
-            result = []
-            for r in example_reader([temp.name]):
-                result.append(r)
-
-            for idx, e in enumerate(records):
-                self.assertEqual(e, result[idx])
-        finally:
-            # delete the temporary file
-            temp.close()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
deleted file mode 100644
index a2a8d47d4..000000000
--- a/python/paddle/v2/reader/tests/test_data_creator.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-0 1
-2 3
-4 5
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
deleted file mode 100644
index a99a35bb829e066c4845d0b85b96cd1eb3a12491..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 76
zcmZQ!W@4P2Bs!asfq}sSh?#)+KN|x>v0q|9K_sIV14Bftj}1RiRKwGd%hQO<)0nHI
Tz>rH1B4onlY0Bkk1`z@P(}N7c

diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
deleted file mode 100644
index 17aa89b6796184407e83246d3f342a55a66b4a69..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 88
zcmZQ!W@2QOHw<B9U|?_oVlE*5&&I$|?3Wl&5Xor9z;M0c)+Lav0f;aJ5k?@w7(|$W
R2vZPY1|rNsgawGO1OWMk36uZ;

diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
deleted file mode 100644
index b4333ed53..000000000
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-py_test(test_op SRCS test_op.py)
-py_test(test_image SRCS test_image.py)
-py_test(test_layer SRCS test_layer.py)
-py_test(test_topology SRCS test_topology.py)
-py_test(test_rnn_layer SRCS test_rnn_layer.py)
-py_test(test_parameters SRCS test_parameters.py)
-py_test(test_data_feeder SRCS test_data_feeder.py)
-py_test(test_paramconf_order SRCS test_paramconf_order.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg
deleted file mode 100644
index bc1fbbd371216b9904b522ed302700c79d2e4876..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 57218
zcmbTcXH?T&^fefI??LIk1rR9#>Ai#!N<tTvCXfh7mm<9yL+=r(352E~9psTBozN1B
z2#QnzL1`k0I{e>v*34S-X=ZL#@+H5Ob$9ML`|P{V`M30M4Zvz<Y-$W3BLe`)u3vzE
z9{~mcT54(<YARY98X7t}T6#tx6C(oyBR4w-3y_~jP=KF@k55QUPEtrjR+NuV>JCU&
zUI7dS3rebKsVZv9DS;LL=SRrs=;#<37`d33xD<u?gcbikuYY|2Hd=B=3Ly$I5db+G
z83h~JzxM#a>wQv@{jUZ1KNlG}1tk?V4J{o#!}SAgtN?N{3JP*c3Mwi}%Il}2ufGRS
zvQe=ME9g;k*tpS%1OgS~vnyyt^_%-SVY5HPl-z?7=;&|U<l^QLmynbKNrRPD?x?D%
z8yFfHn?Ov>;I?-54vtRF9-a@qynPVJ;E>R;C*cv1iAl*RscFwpIk|cH1%*Y$n93?F
zuDS+aTlc1=wXMCQv#Wbx@cq#6$mrPk9Fa7?u(<SL`OEso=GOMk*WJCN<CD{~pXa|W
zF8>D?8Gzz{!@B<b-@yJKxY(|7kyBDqP}2MlE;8~b*KZ0oN-ALmYIZ#v8n-|W5yf~~
zpni5mb3dJ^66^=3d(bTX4KeT+@uUBN_P>z*zXO)={|njw1?>OEwG3dSAiF+13O0Za
z;ByTOwm4ChWV29^1<SQ9nn|g2BR~-87S!%}^e(V-ygR*((Ji{Td|p!Ca9t;x?QQrS
zP73>G&sL$D?W+5rz7snkNFI#nfYUAyBkLzPqV#6w1I?s2?(ioJO`>i6jf|D{&5RM_
zJ#OwIiU<W58%n{k{f4lNX**8j1xi<*-UZ%mW140Sih|KQzliJI?ITjD!pJ^gv6Zy|
z;sT0^JeRCGrvy<Q#~vqDV@=LxA%Y{%YP4ou&*c;buD}80S*S26a2y)tr-?#fg^e`f
zMyhZ!A7C-Anw%J4?S(#pcs#3ofl<<>AAW(MtKNQ2VeqUn3pSJNR$YWNun@s8Fk*2+
zmoS(skO#yh!~<33al=t#xTU~d_>&<i<w`Q=Dr6v-XcV8-Re`vNU=6m73vkYNK>2Vg
zGE*_qqSi=wkdqog%^pj~>Rv;GdWN(PZ@>=6ciTl!IZ9L@Mf&rdB1uumG5q++-D=Yt
zpFlXIFHC+tRfwxi8il6MhI%cH0dN(Ou(?j^@aiWx2^1o3PPLl34a8>~zWu_3YXVXa
zW7FlVj3j5h1?8$H&b8#FK&Uif^fC$sMz_GBRmviQ5U9AYvJnFmHx0}?6b>|rv@tbJ
zbC#;aQi6$=XV?*7-?L=h^@b+AKwDZdlN8XslnU;eN*`E_3WW|!lTgyiwe~4}0l+m7
zm3l=Jj}aIZQJssCqhN71FbKsGY($~JPJ)C2T!?Xwh!`k$v#ci8E>VEJa&o~l2`^E)
zpn<I6Y@0SSqy#R2J>N=M05Kj>Zn$0ud_MQmHsv~NkzSnKbz}-d;<v;NSd-u^&JooZ
zcQSJbl|~zw%t)NNhzuNp#c6>UUFN)U!EF$%Ft&!x6eY_@rHuitUIzmdT>SzHfN_rE
zYTREK;Va2$tG8?1vaR1-+8!sSO`e#k!>)stt*F&25vm+WW?BVa*)FL<HL=cAQI{GC
z@y+S;QCZdx87I`(!1|!A6^LI|z_^>JFoeeJLQ7U3COuuf$R{mDpfw4e&WUII2pL4>
z0y)bKRm<U{Lw7hCJ=4N*5*SrnEg5!hjz<fG4%~%;TT}AigHLh5Bv|9UQyxLf`=Hh@
z)_wWV+ejarL=LyBHKMBu+?~LG4X`$k8;1pY<-gZy1IEpot=~Y;!Mw5<^ma|nEohf3
z-Qd|aO$47J*7cXD0GGM-7nJh!VW6ez5*Qa5TnfK{yPx-htPkD{0nIbT^TMH8bxKfL
zw|z4z{$1UAR`(azo1K$({APj*D4$V>L1g8m4$MvMT$_ldW(X<PFP6YrD}HIm_sHZu
z4;xi{Cu?cz=03zo8s%fShuFXwR+nK~0rPC&pUT|I7J3*ni&^XUz!a2=HGoPE)Cb!@
zVJW?GN=VBd`cJUn);y79axBA;2qUJDGtj`$5RUY6Mx(^y)2>fbsK(wiyG;cvQe=;D
z6{>+N8cC_uS#*xVK*@-8wB5}h_P29>L(ukIKMC}*h)B`A(k){n@h&k}Pf{I70VX})
zPKJWENqhn+zyrzYi;>El(kQNxb+k1IDx8ibfU2M1i;}^E3okK<+B7CGJrn&S^RxgU
z7KU<mH5@3;_@6-Ilm*!6;)WoMV0sv+byx#DhX{sBWF<-DTY(26aGVC>`&OJhq=Z~u
z9WalXK7z3s6Dkj{BVz{g_#^b{zRDB0u~-5mHPPIH`XvV7GRH$DCq+iKT?l?DRZb;>
zG1AWEwDI6>M+d^#AV!XC3e{dEq;YCBD`j;|ob3dxFZz~Bp1nJT1^5K2454-}p%SX0
zub>jH%mp(du)yl9&<emFq6X%Ur>s7&;qen@241@22ohjQB2Nia9`hV9Tuu)skYfxp
zfL+*Ky+&4As4-L(gT`3sZ=>gIV8VJejq>mXyk{H?Lq&t}5CJbo!I=Ol7?(hElSo}i
zz7*b;RDN`K!~;?R!QX8(v*P3s@F^I`q7MMhfYzIvubI>hhH)#Wg;dAG*q}q?00e0U
zL^=eIxaacCp)}@Qu)=({7Wp3dYBmc$)`MCgEPF_L&XalMI?OCdbJ74;K2klDytEe)
z_eGBz2wP85RM!V>!gz|Q(srv$5c%Znd7;&FsQR=lo=|z70A*KE+fY$kuSyHo>vS%z
zVb04XV=+UN6HV(n`!EJ`If*ceC}OfwuPRR~<;4FwofKgNPeB%GiEc;^(OawJ)Htl#
zCUwm$0}T!W&^urbXsxRzPdNU>iaLoh5d#BR2U?0Wiln7sfj{xTNn?&&U(7%>W&zkD
z28ao8&%jworW3#C%_#sv+rVC%q3N?=KJIfrv6U5-Mp_7v`YZ+ku)!d}Ue{DqjFpIo
z=;;EV;5dEUXN@Qt0m9Y9CD%1lsK|{%@XC6|=?8PSyEw&?NVv44JPwPiV80~{Ze38W
zWE}=C!xzvnAHD13p#lMO$ck%f@kPWuB6$EJ3JxUhqCH^lrA8`QOhdw&#7witaHCYW
ziT;{ID2}skmPdHFh>T#moE3`wAgf?v<}!oeaHCtt_(&9E3dkTv!r(8av_`;n)$91M
zFo?9lI3xD;c|1sS*^t(~`ochL;B{pn`ndZ9l+)`iluA>%b>fDkfUN`lW=vez%0bpS
zfxIp(@)=;z!XLC<G*8Pl?reEliPtCAwjR#=*s~JpYT!(UDdcFP4?jYf-U>J#!3S4B
z^)P~%xUpa{a8eDumk*P@$N(|^I>h>Bgjexk<~&=lA}qW*R}cK@EQ`5@-lxW^L3cP8
zoDL;GSg5i0+h~<VFo5=&cr&PRbzWH{se+5BF?XW^>aD;6JhA|OF%#v7_z;xSk#Qby
zC<wevt%f+`0$&lE4uMHCP%tL7OJq9><ByQe5)Re_=UMkZH=_c;=9-dlqlchslxynW
zN|e!=V0KO2COY(xNrd#csmJ1Z=@Vv_Qyt<l9;0%fw()rsSrg=Y^67|;+4o29Cg>fX
zhy`i=16-bUH2b;iWw4EL6^U|xx9o4stsYd&$)E^bjvcOtR*wPQjkc5)=c;Y7&n{cn
ztsicPR#dZnYViA*wE|lE9YwOhIo7VB2vu}LHTZ$I3F<s%;lI-@u%k^pXkyWI*?K>G
zjsq?S&C>>8kpa6<iTI^tJlu?w*m#41<CdAUGM{ZHwhvKFB;3IPQrbi=XGSo(;q{PN
zVbWH$H^xrc!ZYaz0!lhDWAggI7!ATwqluj<cg)b3?7Ew(weD`Eh;bwzv41;Bqz~9x
zT|qF-kp4zNTfL6GR#;hN=DNVn?S!nnygdYX{3h@V6nL0dgaN{cCK82eU}5HiYZ54p
z$qO5+rmzAQ*9hoN15O26M}+vRIvzB7i{twA5lM5o7PTl|_B?p78IwwbsSpl4<0FCb
zXcEZhg`ZgX`?#u<RDqYXK_)>)BftT*+9)`rvB8EJ#0AfErX|oZvGOQcJ;Z0|RXocD
z)#lB6YLfQmMI?Yp>lmH{lB361Igb}&;7yal8nL#rpq5<}PKN^%&HAJ?Np(mz7zj(b
zhcLHwol{t^+vvQpT~dR})Fj3a8TNC4YPKukRs^X-g%9hPVKk9z;ST@-<}lU8?@=tg
zTrL8ASZ}qUlz3r=sD2ZbW+sJIHbpC0T{GYS-XfqfUJne&d$|i0mlvRkS>1<2-<&M9
z@Jasz+>FI}4V^ddJF~AUzshq04DwO?&*QjiZdtws+ZO$SJ0(og<}#~C#ygFHJER3}
zHDF&R1%Ir;YuYOi#=KTGGps~fLmFpB-FLTxXf-`UUd@ZGX5O=DeM~C2W$ZIG;^1IV
zA`ojp@cn=cTl7&o{?IAL_bU2NT_;MWvOh&^O646ROLc6;m2?lH`FZ+;_J>rvhoYi0
z8D}oNX<PSGO4<`1<XijyJq#{`X(5vdr86u-nT4gLpTGRF+jj!~*>-tN^+a&yj5D<I
zCaqJu>R02#cMl94N#&TCVXUZ)vx!1e72XX^vQQ}z(BN6^BL0Ov&rvwuY5oT|i<KVl
zN~$kGvj@>!oD%}4UOelECB{R$*D+S$b+yZo>J3q4jERx~!bp{AgbT;kQLWu(&p@kI
z-64WQOJ$LNxP-;tm;q4QiGkB-3G}V&oi$SH@k`k>oD!<l_#yRd(YrXqkv6kIO#C!B
zo=E~tG~Leoft>YmAtr+vG2uXLf11w}2nr_o8}RXzFqv2q<oSl{#};t6tkr7WJ(LJq
zP<=~g(0Gh5>~4G@lcl<y&orlcsf=wO+*I%MJ5(Z<Y@h(c1OfxIl88|=h}mk$sOl*{
zluAtc74`P+QsmsB<Z`KpY~Yni(*bM{IXBi0mW__0m;K<9;Vjb5{7Fa9%_>7hRKrrs
zio8Q;Hb@C{(Zf8r59e2zVtZyjZn~LET4)*^+auWXKueC_J0_@2<J7n8SL(7DHZ&l+
zXa4~7yY6@8DU#_G+F~ATO20s;3_E0FPcmINNWT~U8=ja98W@u??4ykDy>M3|mY-ck
z*;1C))Hd?|B3B@{yl{^y({suQ*z{I?g=}t5)vVm8Vw{7g^>Ob*IWhJ^h;n!umi(D8
z^E8kHD)9o7hcak^Ae4-e-y|d>^I14KwV<l+4O<t$UF((E_2xiHXHXsK+jf#Lm#tA0
z2}^WNiE{<6LB($w#l?Js-Kgdmx&!WOvjk3^esPrdjGNo8x>h_`2oIthqmO7Jv!bpd
zXxjVI=~dKH2`fV=!E1=p(=L1tk73`D6?KJu&32ts#fR!#V+8t<H95ja2~1&q&L#!^
zc4Xs)rN(e!;ZW_Y3=mj`jTlT&C4wYw2!KdKHK_Wy$^-<6u1?>58%<Oq$RXktH;>gb
zkkFW&@#ch8YBI8n&ygs0hS256M7y_^_dbc(1X+8YQFlvyeWAtopR^sleGrx3WPsk)
zzPl<(Na292N5|lxbY$Pr{tJVYaB<|qeEV9=z5c)irG>VP)GnBcGD$W&sgpj344tLl
z(R_M2r|x1RSingNv-cbPq4Os)*)yQLR$=3vX6$Djl3m$>VY^WV>>y6fNO08S>gTJx
z^>Z~bY`b(sPJ#5yRE~!bezhb9xD&C+BE})1lWuZxRX4Nnb~3qR{HKhhPFf!}`|Zjb
zN`s6FvXM=z<BB1zV&*?KVRM0+kJ=K|H!^m9oZ)Z#*_$wy`%0@Qo;Iw2Eu5@u5^VTl
zE!%3m@fYZY;~c&X>A$^cXEYy=_npk>%_fTtEWb|YT7E3BDZx|)a(w(xB~D@YUXXs(
z_OMM)r0tV1wpBxigr(s}rM8XDV{pdOW;I!D+r<9r?ySIje9pRn*DgWQy`M7nruMo3
zp7xe6+NL6`bBtvn&Y^%Owc_#l@gu%G%gu9R0l!Y`7oD9)lS#2RcP+uIsPvnUO7iew
zpzr88s*6~juaCiDY%i-M@mKo_4W1U{3Is{&VkIn5I-?aE414>Kc0C)eIvtF8!F8;$
z^z83GPa}H%gI;VNa~sb<*xC+vZF|+`6PX6F?5_wj&)n}0NsdDCeaeCbP8sSH@y=on
z;;0G*b5z^r8nZg-p+ebsUZwdnW+kXdwcXf|a$fvdE#x^Hy#cJ@VY*x&nRebcK3o{V
za1}WSnv$lW%pFl?qN}7BtL2RF`o^THZ=U?!;r@zH|5qLObzSIhb;y(?Z#-Bk{%-Dv
z4022yQun1M>j`UQe{(x${97E+$~i)fFVHkIk9p_9c4=zR1iTkwQ|OTae4>?X)^zcr
zN&fk#TH`Hzy8(~C<V<M2yTpSmmXbf;!loY|)rYBQ4D>KH-<}NqOjNGnY%-K2i`wCq
z)cR8oAwKqaQ^0FJihEoWz?O2aJAP4cuJu{M`0W=HR8bUtb-8=iAGcK(ceFY@|3Kk3
z%KHUUljR{_X-JDk>Ii-_w#<R&x9i^$-T;L{xLk5#%0}%F4>2xp&&e4=zkG=C`UeP!
z{2%^?FGoEJOo=`m^vSrL7p=4?@L$`DC$YV8c2;?A`f{AwrGL-bNcK2se=9?W9@2NQ
zThe~<fDb_LLcr;*j8tw_9PzBgMOLoJ<@o){uip#ND=U6nu;7`E*YD06QuAj?StjMz
z!T98kEq}&uM>K6!anMlecoh8rTmIvsr~d$}r<eZ#$-ezzZ+^vye6t&&$cR-RQRN<4
z<?~+X+W%;rDe%@$MIpnA9|WjE&n<$+--GjdYc*zsx!_i2!2*b+6R1doxCDX@s~pC`
zkBJnJ<|Cm<Gh*hY(sVqfgdvCQx|Es6H8R`ft5$Hhn`09%IEfU2y=7t0V%lBhRs9`1
zoDK2^QP@V+;yTO2z%3{x2ttP1O9$Ljr#ahQOB<-=Lr@Bp^YP$QC)G&<_u=^U3UYp^
zL;`}Y&WH|UF~?zwNujJmtdWF>{p)qYfty4+96^45#es2#2ZNC2Ws^8W+9}^vRuquV
z_9=mJKv(s^#RjI;+bQ4`Qd&0RLp6mN6WyNhDbW&`WS_>vU5ZdLL$nUL8>*_E*0zl%
ze!M$c1t`b6i-yzm-hZ1pN%WA`1LbcXoxZ~GRA+pnY_#M1(_p@xTd?uE4Uux38!mX~
zNA8r?UI}tnXG(?9`|5H&MLfe^WT$2n@(-Ys|H$KWZkN~-@eD7fGx*|<+7?=>tHqee
zI!wroJM5h-p5qER_fTsw`7IA0F;en|XKzm{O1RXXQXV<$GaY)U4FT7iS8kXZzKCKe
z4n9-cI3_@fvwT}xVk4B=biP;8T|K(z_fi)jahw+vp#J-b<PrG4M;_;_hZGGt36LuV
z)K{D45zxv+w0cCGJ9_CGmM=z2x?bjPnap>O?Cc(aD)iA^j=Rq$E{Li0io?cVOn!Iu
zUBx<AXa@Ly1eW?fCn@niJGF`bMH&}Hf3Fg(L9!RhPc=5JdH-(ws6DdK-nCj9m$}dd
zRx(ogmGOglk#&tRif`kG@|@$RAr+g?lxW6`cYh+shBpek4^w43WfJXSp&|Wy$dcO4
z9)WT+s^MOC3a_SvlD|Yr9ppor*rT>c_|?#6Si{oclK`lNAiZ$}&2e?Lm3ZrI)>i@1
z(v3q{e-f#r_Q5L<K~iX^S36{;_vPgI++cxAaE5d}!tbpsA2XBRGY5pfB9ymZfYf-y
z^Q%$O?HX_pBQxo1>wp8({@oz3Z=ax903aWD;ZBb-_Sk*<L!DV#^)xqkSmwzSHIdRJ
z&-vpDb(!F@zhl9|z5$&8JDGF|&&=epu~+Y<Ha!}J(|LvB3Ctx<C3uN|*$<{Wf`u8R
zT&FkLBLJ@+(_8K?S5?Z+cOh77ETipjv9!xK&BG&X<R+r!061C_l-<9>NrF}z4+2~{
z>>d)kwbf4XAxr<gno~<P*bd$&sLzqc<RGJb3;lCtfnJ*>3mL_@<i|yUs7$Xz3;Dxy
zFRkhVr>8sjt@Ip(XG_0eOsQ{bvLL^{#m`Vt#G7veiYOY9wL|_Pb<zt;p2VsI384_d
zmqEe9!%rl0F^3QT9KZ2ST)-RP^tQn{=#vr~DUgh)bgK&nvSrkh{{;%=N9UB~oAS}`
zZKGDIhjc;NdFlFf@fp%u7(G78U=bz>Qy@!s@%~;k@_083`{<_q=~?}#fIWS$R`M+Z
zp)DsUSh9Q*vj8Nj*nVMwwKUkz6y$nww1iN%;7I??cM0E;P%j!8*J-?|sJsDESu)ak
z$Ss%!OsDiuP~&%}sH|Onmn}HDFq>(J*dDEF6@Is9z@}9yqV%h@)x~+rgS)_V@pp5l
zV8zH;rMZ?RV|1~5j)<z%&ZXe4b!!LmiKgP|pg(WiRa2~jgJ*}4%we(f-Ke)r?YSa%
zybY$WXgy<v?Zi7epQqfbi{L)JlKyQPRMN*IGN}4#%c{F%<A+GL|LdRm+4C$mT{oG2
zd0?h&p0z#{U)nje?$G25Yq)Uj%&`;|ifrn;+Q)Xk%ch{yg{iLc0Aht5r{0+T^?dQr
z<;Mp}!7{TkY(~;&(9GR|5rcMkhrg-!&apt7zCG3WYkJj7d+e+1uM6XM`PgN+Uk2|x
zE`H4{d#@9nF=Yj$e?EByva838C>d5JfXcT^kPSCPBcG*P3{+_HAeaYiFfLws@fJ|V
zn%n$7AI*%ZXxZ6bpc+MOU#-{$1tGJR*@B>!<)pw@P-nxMaah`9pbbZYk&_InQIg5P
zaEz^M$fT3_sklR;dH`|<9>4ygiI@s>DjY}hm<0moWH=a@z{AKu{IoMxxQ{rZ%&H9b
z5VKWa7de4%kSDp&2Z!L?=S0B6;8%e*O4dt7*Z%3sEGAA2eyuj)R;v!IVg&G<R~Dw4
z3}Z2i@kf#m))^_Y)}VZ_rrini+~5UYJF1a_?JS<>p}f2JyllBETYRwX`R07u3lxsY
z#C>KLVnmM^uFvBU_HP}CsOiGj-|3Rgc8Zk0((7-0<m6tk`bAc6ID)1mT83A(zFkRV
zEJdeiGsbRK-#*)RYXUqblHeUI{<fgyJ&UlX@VoD&!>#O+WJQy<@#kUG(;M+lL1TW1
zC;vUZwG_n9nOOEd^a{Bgo@wQl-;rLX6B*pE;}WZMKQAJ(^kY!Y+feYWwI>t4y1f(S
zHe~^HW0wnA;TskJda?-xX&t#U)aAx_yp#6y$oS~!puop;y;d`eOnmB!V%K1dCZB6G
zvMXmP!~X9rrbms2<YQcjZLWb~vgK%bRf>HB@%M4PlV2c!v_&qxz34rD#bMz42~hO7
zneYAo{w77*)#y&@+DY?<d8yeP+J@>lE3ZEL;Db6~3qV=wZh64p=^O}reyNZ*!kb`D
zG3DZ>aC)dCJ3k)>xc5<%+sl>gv3}r8q!hkLwkAaH=)48Z9vu{0Ch?qO((My@9CsfD
zpU{2C&-;l-)gu85{Wl`SlX+j}Y-InP`4+8kmmHWXIsOj-3C`nXkUtaEd9oMic&?7T
z$*%PFZsuhtJgVc0;c|nsq>8klXk{GIPSlI`r-$$lYy^Bis|%75O&h)FHHjQ)yK^Jx
zMb3MZubJ<B&Xha<0d}R|3<r;@!HH!gdrslW*QCIwuU2U_KZ6ooLr3PSe_M)WdB=t0
z6HVke<3CYTNAWbudv^Qt`kP)Zf?J(=Wt8CIs%q!u(<LW!b@hL~Wbgj6F7M*{8cFI&
ze(og0<GqKNkYNQr`D@3K6SzB9^W4YVe-MS0Fl#Phk0i}1rf!muOX%i3t1a8opqBVj
z4ZBavxsP7Ba8CD{^!S#}Q$C?eiHfw|`tQ7)aJSH%U9KusDABC3-2j7otj&8=E8y1J
z{N3t>?iMKrp*j_bNn;|!V-jO5nJikQ3$zmol?N8YDYjNouf>X0rz1lO&07+$S@X4H
z|FlLj^SzY9ppXE^%~($8_P1f_1`BGdr$o|WkST6*#8Yc~*cd5%XWTL}48N%(7i~IZ
zU%DnnLt=iUGH$k2rC`LRSt>30GJfB-z#)sAY%08##OImu5)bozTFb1+R#H}!Aw7g;
z67|fEi&gh)D=YfsTeHPiG8=*C5>*&_qpj*5A!x+6xj@?e?9g1Y<0slBe!BaJLn=#z
z?Zkucx90+5A-P99QfyiuFukTku<eR?wIyP%d)*<6qv}0DVdni3fTc?j-m_^kXQyY=
zZI8d|r{{lA#7!y`m%mlO{{vL(nR&D@&x3k9T@$>sl|J)%6DeLrCa&0x-ogMHfdtJ5
zA?%-3e@gOYj(|@X1IncDJ~dpmi~7s2BbU-@BRs~<5*&!sgH*_*_a~5ZMIK!Utj+z`
z5r9svdRCF;Z)%(Ui|+|%EuZd<<PH0@oY_%ogXrMdUXz&0mP7DT*<`yfbyh_7s|d>2
zCySS87}9>W@FvZXa6P`|hVgAS&D6A=qmc*WmEFwE_1VB#?XWSf;phmi{;5aGZ}*=Z
z7ainViuw=r#Fhm*2WeP&a2T{XO(+=5oNfqO&$IbEE17WKXz+M2K=AiWr@-^Au}^$6
zId1qJBD-74dtukplQNi;-B0-xdt6h^vWB?4_hvd;V4hy-MCz;7ko#1rH&0|isNs_P
z^KwIe=);oRVt+Yn>pCBe(cZuE#(Dl1>~a>P@^s52LgiYy8Yxqmy{s3Jb<!JBMIl3K
z)lojJY3YW8m=e^pZekM{On;wb9clZ90=%$m9Be@D13|EABV~EsIB;^@R>r!IH+@Z^
zt&b0{J9<stNYU3=MPbMGs?FYLM>*hFnvNRTTP$}|<{|=EDTr*U#pDT9z@@?Cem6AW
zPrJLvO6R%F_!Ud&(uHy0luk>wqH4tx7!gwURfElg6Rg>0I4>*7c|&5TDZC;RNFbpH
zfsS+UVFE}IpX>|#d7$;EAjaj$X1h?+cFBN5Y_R}(V_GTtYrLv}UDd-nV4^WX36qzS
zb!2&90ld!OK5)DAO=J|$Yti*up_`LKqNefI+|nvJZ78r&($hAV<a!mQ6T7c>7hVk-
zCkTLk*U^4NMIbGFeTyj*Z7`)qipH-rN(4P@R)kH}18mtY>@Bq{mPI6Lo`bG0TFZEM
z-ja+OeEp0u7w!0=@t}lORQz`b+RwK1cbM!bXND9@JUz6}65{>EJ!J|tS(z8EG&t@(
z_(1mN;R&6!K@fG|{vv0Y+{=d`-fb4iSUJshEqUmZp*xRCr*6DH{5ILv;zRpDMX%=&
zo7F^fOx^8wTh9}1g*W9wyS?O8=$VBG1s>S9)`LU=21OMxI)q@b$I*3Sr#ymgk)@?b
z<QK*@=MM<hem@MpJB)G8z=fR}IDJ|pZ}xr0<+qxpGkCGFKWj;B`hs=YVxyTDsg5iQ
zp2PkI!%Vw3pV5A^v=w}Pid9Fo9JD<TA5r9p3tVEciYaq`2WR-n=XOG&lm$5`!RsvU
zAsDDEF5)@L{XU2PE&GFfEt{jdLO8~A3k7JGZCGT0<UUIOzFAXbLby1A-L{UfZNVy#
z6Xv(3KEF~gtF3denUJTQoPL+SAsVI*s4DnFv(UDY^68_(@NABi?1OE|&!|_kDfz3O
z_r59oZklwAYS+0WVR&*ZcJ2=Zi<GIkQ{*&CsANW0x5-u?7eh>y&do18L03m3y^2Td
zD#t^|`kQ<r+1m<dW0Hb}Teg)X&-8p@`vQe5&j~Zre}G4_GAcPLJvr*4kNJmpo&$1%
z50Sh@gYUyKOF}sL9N3#ap=s2tRj+8j#d2UeG>vAUTx;n=kLa_A`c*Po7M(G!PXz96
z34A)M@P`9lTkSeMVfz-Fx2g7(MFf}fSdN`JT=ml7);Gd(H+Z|>g}DZW<k$*aIda-Q
zcsYO{{hRMr^KlEWwsC@+l;M%n@5f9jGCQ6Paje`V<ERe22(*YdEHu_GXTwx~ucWP)
zDRgBSj>NQomKVu028f|u=QgPrEtQ<5mI&ip!r8)r-?_Us69G>7wwrvIyR{3dj^|;0
z4PWIba;H^Q8?a~14R-Sc@!T!#US!+Qgb<CU{QfB}{qGt-L<`xD|8%c~RZU;#z79^N
zhuk?wcI$cYu#B#}564oXV?mC8`%2{c>$@vn;Fa*7m9oJ`O_l*VZ;`GW{w8j=7DG;V
zHCgHK&L{M?0(wIZ%4uz5JDtlfOzzL?DovCGPXm4*S&I+4@K>q9%W(J7e3|-IZb~Z4
ze0q2#RUB4+3+=ZSEB)@VAKF|<buadh-Teg>+nYD}<hf|0NKXn6SjX#4J6={=#L8J6
zzruuEw$KrJ**t7W{O$@vO#SDOH#&T$uR`us-ShZnd~f6y_@NW&GG7#|7tMb#c7bi_
zlti~b5qupx)ruMOzL3M&g>6C_^Y4G)-i)E{w(sG-(m*v01=1IUmU0SV>&)JiO(cGw
znSb3RCE(8Ip(-AEnV=<IqVNe3%-Qu4m;B%UfyU9<eXGUmtGPj-z@LmSo+D&abbnL#
z?F)ldKI@bD&%FHk`|aBx4$9%E%~<xblYRS&m2se_Ozec_mKBk>YfqbAB*QB9m!pK^
zRO@d(Fk9W2;qMWRBp{~hYQ0h9aiF|Sael-`P-%u#XB+D85;j&0Wgz61$ouosd*KDs
zN4XnDlB*GJyV?H$`}H!ik56RC;?I+n-BgHOV|xK@SNA;w%hZ6WmbRVzfnUdi{|4H=
zJw0~UDAANvgiHm|?Hm!p&h29nXb8SO;B9uP;L+Ks^4MO12vvUH@gUPkz#3VW70Hui
z#&6TBUi;M*IsU8;@-g$fs0o0@S_iKjHf+`Mq4jNQU|V#cU3KzzWlB+IT4{*TsXNWC
zgQN?8+6x<F>ZnC=d!lc{y}NMhyZKzXq};Q*V-M|ow?a;ej7OQlV~`{iMqx-Em7_3p
zL;C#$JpP#EqXbW0$2>!dF_qXl)*C4zs?}!=`5+2VSLAe|#qkO3OKlm*PHz?N$5nIT
z2*x2d$nzlFNEJ)u6bKjeD7<e$LDVV_eh&M#j<H>sjjv3rsAZABgkxzi{~6wp)Pp7C
zZGP3}<G1*eWrmC=V7WtvcO^1x8O<zaR_M`f;m=B)q_85GMk~(p(PD%HDuNq4AdhfF
zuza?8n}-g^^W!~S2?QZ!fxXBDKl=9xd7(J+I0dr-9~+ZMbvWHNMl84S)^(5H7c0DG
ztv<6d$1G(az@&=t@|iLcG45Kg9Y+pgUJH%Zr_I`|Btn^TtpW+kCd##*N6bbK(5`l>
zsUoi>&8|)Gb{ge7-uw%7DbjtHkqg)xzXz<iWD~#Nj_&;jkZ$A6@8T?ZwtWHTGrB!)
zb#CS1eA8am&+N@Q<ONgw{%k5<^>0i*460)EyDCz^CRI(K?7@!nve7q1p|-bUsvr4^
zhfU6d_tgIZN|7Zo@8jjrJEb6D*FEi!exb-P?e*8olP}`g{27Bh`2Q?k>AVb4`YHpy
z9$JW*v+$>;|NCG9Qy%iXsHkeGUz%egb?uJ{CC?mKoTkF1@n%ChZs1LQ?ewP%YXgc4
z*sVrf&o|V{gi1JD4n$ywf*kiIVebwtjC5Pq@(J7S6!d<<ID&WQ@#^^rkn9)zw*;c?
zV@oIy<HVO{E1f-ESPG2%p>$OT5%w8=a#8ZM#yi$kon>-9t5UVtctB~^_~wx$>b3YF
zH+^N#Ws<(_wdZvBDC(Qo!hk;0AfUJMep$M0>7%eL!?Zb-*m=-~cFp2ZJ)6_yltp7R
ziyckwulb@4)Q3R&2#<HZq;_6T`$>6r%lNS4Yx#C@`VXMBH>hp*^aHu19#00#FOuR4
z){|O1)Ac51x*%#KRDPHfLfOMkE0}MhdJt@EDw-+nLDBK_ZwdYAJJJCyo7Vh5ZlAfQ
zx8yMwr-@C5UO{KKuRtAYN#M#uGZI56*cr<+c+5EI=&ICCcgY+(j2oSk2j7oYoVWa_
zkwK$qHXe8iIN7xsjP~p-Rl`0Mr634O=^TWPtOGdn#T=N~<x_o2{#!E@NDA9{y<z9x
z{9n$U<28@1c5^c)E|2a5ZoxnPM&@29nXZ-PiKL4v>sN~88%%!>Rdp=>$mkCpk%zb~
zz7{HTUO&A!^rI|~=Ye4J`(?`D2m8c@?tP7T*%W;;Uyl`U%EW!x2gUSfa*cTkWX5y{
z!&F{H6f!0yIeHsyzSAcOx@#G$^%9aN_U<>&Q&k1WyLR}YZnx+O3^Kl%+|z0`apvsm
z@fqrNDQQxDzFSQ!F;pgUU;FF?!;>#d>{Y;>`cFGcgSwWI%Fp~0uYQv-ri>@rdK3Ub
z=K}YFcGzeHTg|wZmEPt2Eujl*rkz{~)3d(mUv|p``t#>^K3RaYf18}uIXEB{%YVF$
z0T;4=nG@Gi)qxW{?z_1ARaO22sE3MsLRD7_<TgLEhpJ9#zh&8rm5z7!*>N1vFs3`N
zZ@2#kK%gWgx9sJjk7i=3F~Its?2{WhnHNLe;w{QnA?AKLOPc)eS+?xXj1-+#sI+W5
z{+wihQr1jD(wMa$EHq?cft!bnItrOT>eDg8s~Sa@PdClQp#5Be8B~b(Ize&F$*H?e
z^riPJq`q%^|KRLFR#dKLFi`x_8E#2f7$TfWPdOT+6d`LwM)A*)86|%r=dno*=!W2l
zK&YVx7R|~Wq{Cr1ZkDy=7nX2@?l|Fp<i!w@rqsDva<W{%G|ZitODd&u?vCzpppcC!
zd%r-cAPr-`SD4uJ(#zWl2GP{i?z%i(D2#m0U3Bk#(q<l7;W9oa@NOiA&I$YSoXQ2O
zB~Qb*;8hlkrwKC^g6NLRB1T9X9lz`+ZDxJ$wYJ|49~j}a9Nd~<xvkwf66+CvH|%Rp
z6}3KFJR|Lh*pq(%wqFaO|J4VGQipu1Nfv=|#eZAbxv*PW^+!zrr+M+ysyan~+pvB!
z<IvtQqh}Dzg8u!_LKT6XH8lcdn`{;ZleG3Sc_;4o(S**|6*R9;&g<#IMBh6|0R_8F
zD6~K2`{}<aaN*SC`v-V%#J7N&eBSClB2I{VDB%0E3Y{`x^egn5rS5g~I+=~iQCl+q
z_^5aJljiuGQ@(A<Tt+*Zw`}nWiEcL|-ncE&^)<Mo%sRFT-j*6N@w>7+KMPRE_Gjy3
z;&k1mpv}l}-CXOBu_=YpRqcvYwV$FVb#D~VSO?Z(4qlr?H<l*1bKa$mqgrjEh^Wfb
zUx4M=sX&p8zMYhxI*l61>5t5dmi!zT(J^*fnYfz^OrZ>TFP6ms&1){v!FlKlXJz~5
z^u71|I3_sxY3<bQ^7lgT%N-4_Ch16(AniO-Pqx?sej{hBu_pheotOpQU7_c5W}`!P
zgqG>~t3{_~jsbs>XMCA&0oOJwY@jdqICBuQ@m~KUH|fQT9x25mYt?#x=^hDapm=SL
z3>(}S^t3zQfwMc_o-<t8&5H8@4$?T}Q;QL^0D#xJ@TBb;8x%2D*T?NSdAxvz?FPoh
zy(Lg&$i(uDfxbhiE|<S8@EL+lVMs_#m6Jm4Ry_Q@x`UxdMWta~@xG<cuUb~y<nX$t
zLvKhCo`t&go;0>L(V}3u(Sli;a~*eJw%#T24dp?vyIRZNSuUr%Tye|nz((OUnSh(S
zHqy<mO0}=vOtmo^;u45S@2WLjukg@=JR{hJZP#SY+i%WsRpr8k@b)!uvmr5y%ETcN
z4A-oiDWiw!rv!u&!Q9v|j(B22ALtJPrg9i5g<iLsuSZ62ROb@pJCs$#D@HKsi>PqT
zxu(&210x9+3@5Tb1H*xt1tzBjI{=@yIo+hR!|}m&&AD#%Jy6rYs|Nm!<>~d@reR|E
zPLWf!<#r}W;+d40LXO{4kMEQ%+TCW%{&m)9I~SQB+)O;Y_0ZRpS9mz6D)CorvT4MZ
z&fBJ#;SlyapNxOMK0DPu8y{@c)J^F~??3bU+ttVr%Tf7isP&=tGtkJFp%7^jwo!ys
z2)A6Gwr?tYG`#e!@4H|ce@yQqhl4vG^{n-FPlR>wJEzL|e(PPL_gb7?IKMKCzndh@
zg%8gu-uVb&A?m!M^5fRKH4c@-^#{jZS)(KAXmnDj*iFDm0=9pU4KbjE-GLG2&laRY
z=e*`&E`0wkK|8$nCBgM|Q@T6%JTCfum|NKnCY+90FL)olabX|)*=$3@kujE|Y|9D0
zSHV};--$k@5S+qIezksV5VT?KxVUMP35&h_P^b7=?RQ+W7A-vU<ktejuN1}KI$@1p
z-`fg)<N$9=bfN2mS*3KftD2{4-Yww9-pos+6P5d1di&ukeDw8!va}Vrmz$SeLv9*>
zx`W!g%h_K;1T>W*<Nth%RywfVxT#o>wj!1MhO%0}hj%ufU+KbjHLo(GHWZu<(~jUW
z8S4{mZn!uDS@iQWuZ5`um;BTWz3zBME-LdER6Gupb{PreX}&P`@X<<hD~XO~j&d-m
zRv?iNg>rG!E{~5;rQVP9ekZ!8fx6L?j&0L_ULp2oISD;h!k%8z@laH|tkLD!Lt$|_
zF$<;_9W=)0LYtW<2acDo(xbftWoM0DRGz56o@CbJZ9e#Xbm@;!<gIu2G~iX}x8KTc
z`CO{E_B(Ljcp$887Gck@IxXuR4k)Bxy2$9gytJziNttJ(K8L-N`Jt+_1JtUI2wxG+
zV!^Md{e8E?Y4?6YHqbK-yXm>J{qubK1pI=Xw3KO`CL_S9$8Zu`7MbMST4=df;UOxh
z8}$dt)jr`T@<q>GK5(%{_3By_5-o2C_l;_)v+9iEJ(e1>sn`-To^WK44A{I(+;TfR
zw9nM#L?<!=tbpvIQ?W_GSNq!c%zP_q`93z7&t{nZeDJqR*?4u{iy^ZIojHPY$t$~6
zlI3I#(D3H|qf|EDEW7<V?6=kU^@K{p_Mc5cdGA7cTj8@;V&9mxTK)kRI0>s(|J_T}
zcu{t8X}`7j9lYe8DPH4Fb8W|-u`D!~uB3YFL}Uu;Bayy#TFUBC?83w^To<SUHBCDw
zF|twq?x^)D1Uk&u0i964TxkMFyk)g|jjxf6rZZd4d;E|=eUwEjR@A}0b5foyYbv&g
z)AE4vhnekWGQ|<Wl;@)T=R>^<hv3NQmFvE<Wxe}LjL^%$`-M}An}4Z0LVkXvuhop7
zUg)yRD^$}NiodWwcsXnnZK<Q_o_7xyNVvy~j2nmWo5$Qkz8SDcADJi#*`12TIi~0n
z;@*KIFBsve<2zcsmkyTbtsa945&jGA36Ijjcc=JQw{7*e0tC4Lp;vptq1z2r-v#tW
zh&V4if?TtKrf4WXI5(d3`?T@KGW$YzcI)GAZ>fh~s`nLxXBa~$-fcijm0J_XLj~;D
zPE18hIKHZfY%I6k+Hl{IjnPXHZ9Ww^a}K}S>F0+0fQX_?gQ9g_TEBvu?}`CR)=Hn3
zu;>+Bi&W(2pUItWZ-obf4vx{$*J_C9Y<%eH@pi+Vxi2z!q}nP*hJANr8LjZ2ordJ3
zrFwzaT8ypljV{tiFt9fNVm(IQs_5N@<*~K;TXs#cAkDW&zapa?BvOF9AM6+yHxo?x
zJ2iv%GjUpNAYnI=RW*24MFfKtrhjbdK)9<Apk(}M|1V88{Gu5`524n2qu`QX+96x>
zs42@s{AJi$WKQ@<@1w!`j~$~qG8?}#xh`YVUAzp-XYogxQj_sDGzsN#n^8Y?(qYsZ
zsqc+89t!TQ%9}LP)Q|#@w)tJ*pJnzJPwKf%e~3{%XrV~_#nUDdrtoR<W9AAQ$BjN0
zH`B|I>$2U3bh11a`t8Ud^7KD1g$v?lM^Gn%fjRz%VoX()wfhHFW4i~I+7JG=*|p6S
zn|;lfn^Xio+~__ZdEMFY^0C!}vmbwmx$$#Y@ih&8hLI9H4HZjW>B!akD?RxF@{A{j
zD-m(g#61&_J!$7{doS7bv1$*@%)IRK+*bU(@?!<!kDqn?%q-3uKmFwfAl6;Gdd`vb
z!Su@tGxeN~YjY8KZ|40zD@%O4dD}$X(TS{K)Bg;4hv1-+Xf(3|?+CP;@5Hg>!rj>K
zh~QVywdAijhV1gZ+LO$U12pIHP~F{jGmM<fBVchL(!a94b{%V9PL}KtGKHMov^BhT
zXKOQoy>%0koD_00rB-kdD@P*qR<>a(brBCHE0?Vce?-uZQ+zaJLMOtE95}KOH!ZO*
z<Z<un!#g!wdf~;8avJFDPCS!Luc327Y9hfh74=H3HusSD%ruv}yhEApDG!BLZF=Uu
z(Vb9kd~?y&8r+=75NNTAVds&AV`9Sb)<;$ZMzVBcs9L(A6#-@{Qi3T8!CTu_VpywR
zTwkJrE5Y_D$u<Pbe4ZcDcGVggLu9DaKgQQ{(9oOgu3m1l9{&LE+>6dR<VP@TkreTl
zFl9rlUWpHtoMOu>CEm4yJD9nS2zoDxz@-Fi?G=mJ)V;VXLT2cFBPh@yAf)q?&Wc*2
za0txAq(=8)){Rh_-&HmL0M2cpTqaBHl+(XOd#z_1)-~=uFAHMwTQyE#ugWrsdDPBe
zP@U7#HRWIICiGtSpu6jj<=V%t2w@Mwx`ZI@l<p61A6V^#6=pn8lcmvUc1XtT;>3uR
zi{P&9?1!6;Q+yA&wq+Yotfltif)=^C_Eei4rYUW*Rw90=y_)^_KQcJX+?2t#Z(Du_
z+3g#1T8btQMo^WX1Um{~vqPE|Yn%)q&_4j#G{tU~wC@8=Gj^?2Vp94S^S22-DR*)Y
z%6bn<qk?V3x~t6K_HUeY{oeVY<AwM$te7$l2tL}mKl4~ek1qBqoWu>duL7brNS8#O
z(P7;lrE61pq~M{K>CN_|*^%D{*`E^pQ%w-!TPR*X)4_TwQ;#eekKbr{J?*#+ty<Z+
zs0eISne?7dn%k|WrLAaUg=Qq<>C_|p;eO#lLhG@?8?t1-{>;~!8`m=-jc`Uq_2O?8
zP7Kip#SU7-U&hPu)dtx-dp0z2W(*%J!(=~<i<Y8-$GtVZ(MQ>4mo5dz8p}OOq1GGB
zs=sSmn00D!&gxH@3DUw7cqdL%;W$3eC#tIFUy<*9EFPNP>g(U+$n+n%0x4QgJSuB9
zV4S@(wIofedEF&HCFr^x<D5kQ0qkEn)|_jZ?7(R~5EfrQ_o}W=DG@azf9MD{YU@rt
z9rFE;)-AL5+RtLjOZf=75r9h>wp%uNtyB09aQ}5GfKxMB?Q*+Cjgpb9gI^;~?{C|~
zhpVi&KG*WM5DXqVzL^W)&>BJ3YQ88$p2aAfZCrR&ouJ}7H#h$#>U=S$WpB{P6ny<|
z7xR3jwZR1crZJmVcXZ0iZU3tNWgDD1Fz0ZSBBMs<@9VjaHqjma8Ug2*--6NcA5RFL
zX6VNv&x}>7ce%o*#bVwy8W}#adc-qZ_l7ZuEsR>=PoNFd*YoMYuxTH^Qij4Qrw0_e
z$M>f%h+2J<(eC-A0Y!&ssTA@^;-{DPYxN=e5w5YR*)y=($0|AIJ8@)JG^b;AN-_`3
zYE*esw3BwOkdH1LzSYtgv@(YAk6zW?=<s@(&6)HGA*#mQM|OQ9*|%>E^nDHhRNmf}
zDR~pjD@CK}t@;n3-8QFh{^3i628|W*3<v$Ji*E=jl>O}I=zz|>xUM&3cX>h@=1hlm
zCAoI@ZOqBj?f$#ZNq1j$Aj4|gNIWqj%5)=eb((nP6G(cx_>A<S{p|aQ;LdR^8Wkb=
zO1CcTIA(3*^z3?j8ZY9S86&*~@8{37n#j!Oy7*c*Cu?ve^aW-~tFpE>L%tX$Th5PI
z#%A?h)%C1geHhznN5`7AZA&XWxIm62u(<8oV?iPgPayVEX{#M-`*x@XJ!j+JU2LzR
zBRxF6=ae0XSuMf?TTKiewLdBT0p{&I$1Uo^>>UTlw(pR=*xg-3VU5xxJrpN`HQ!(X
zto5ycaTYP=i{0JKMvqm?ET=K8F1zPX6#25Ongqjt|8;D<o5V4EJa$9aMcqiok7=6f
zcYKyk7M~rrYj5|eY`~ao5yQ`f{~Rg!E?j>+|2w;lM-;(6fd<;Y_U}A0Z#U6i%OsWI
zKphbY(=vZb=L>lg<EH<P$3<xWp?Qs5>6p3k;U9n~el4k7uxbbfj@9Pb=$70S)T@xY
z<}W&O!aYelx~%c7vaVqRAp(iu#+6xUh|*PkeR|Ue7gIiI#pc4*+`W-{!QfwH_Eqhj
z&ul06g=w}wrW1Z8C|r`NWfHrDQy4Vd6}Ik5Y`Yz<#3Yd<t%_8(lP3gHELk>nhVxT{
zEf=lBxun7P!J48Ee;(N=zq$EGTEgydja-KM(}$`DrIwNTLOgE^vY;m^o`Ii~(0@50
zDOJ&yWmcrMF7AXaahDD0;ERQVgZ+_777v&IvL8~ibBm_iO@H#-(6Mo5iv8&nG<{sU
z)F>*^p|&UWS$~xzBo}tmX7RDzKLAg~&*Y8($DEqO(In4IgVJ~Dh`~^*HAZHxz|wps
zk{XS$Crtti<@~CWMA|FSj`?p-TOYFHFU%QZ_!dE6#eRvg>}=A3s}U0H_Yl;Tuyzp}
zQdi=!ry<Z<c=LH!H7}kx$%n0q+xUT<i%OW8LeKRCLcY5DCsEAo2oW7OZ7(<<<>P$k
z?KS6Xscd7IJfz<Ensbj0-9>xg;PIA0$US;?4@e%5-Ip#b7f$WK8LPp5r_Fex8_S1M
z7`ivGW$M8nH78<w*hLUQ2ijuk(ry%HTLRPVi}h31OQ<1uvi@2-ysOzDs!?jlX`1uC
zfAOmb4l$b-jrzTa6BVeu07z5=0Nzrj^W_Ow-VzJss8}zB>N7!@5zR~<c35Qbv~wc}
z=PK+D1;Z4s{p^P>FT;UnVyH9$>4#sCRb4jD=^m!tIR*qn=eFj0!;0OHb)AthhcX)6
zjuA**L@_}v7$XZaO1rczS}$f@ulYD+AfDBPZx6=2v9A(&PHV_1RE@6%R153kgkW<#
z01IM%oni!zIz_^&8%=U0#RWpixz5;}n<la?7#zltY3)bOk*-}RJB9ZRy~om(Mm-gR
z26r)GWkER4G>U^#8Klh<unF{G(14JbZtAXOXO?2=ycRyo=oy?TBcEw*jlHtlGZe10
z32Hr@(#|iFN(~qt)l#d!v&eXJhfCn)KL9|6{q9mZRRKtxrmpMbCCeTnm=|j*-uT9}
zi!p6APj&P%@(-w%%b=NkSK}&s%W6&IukH5(DxX2)qoFCK7Jsd02WRDbu!d5*sXxVB
za^|<MCBX|HW!ZX5ZSzjU+AHx&SKV{fjorCdgFe+Wx8A*^=XlvzU{hL7P3hiJY9mSC
zlX^}~_74z;L2=(xoN1|e@Rm3JSNzqGOodb7UDGyWuTU8U;pZmDAl2cp9l-)6IGd*G
ztzuS>Z9B@NJ#~`Jo+AJFoC3{vnXB>WJ}nS-z)@u(Ei1s?U*~)7A6i^M%brCC+xQ3F
zo02)|-Ya3?gQ=RWEI+N5kuQHaDN<#6udO9>Ny=yw8;cXqoyXHtw8~7a&XBivr(EB!
zKjcsS#DASNPYr>Tg4wD5^Lo7XIkvT`(~bAi)`f*#*v{kl|CXNJ={#_|(|*#iSmQDB
z4^a1v&Nh4T@L>0JNchh(x%S~9?WuO_`GiSZ%)SEO&ox_g{G!}MS-uHnM)Alm(k#T#
z=4fnX<>%Nn?Ozh_3V5Qa?QB_wUz|H}XyscGYimE`opFaL$p#2s_!xirT%j`)6Y3nw
z0{*76RpGC9r87N;RoecsqWGf@GW{xcpx#-hXhlGtX8#{xu{Q73Hy68n!bivV_JLAs
ziMfAtaC-F#Kcj3t^Sv%l$s0V4IQz8>T%!Ua;g9vx91ljVfgbzLcg?vzf9je_o3t16
z-Wi?={62l6tdG##eVFBa9wB;k-9qe%+!86%1X9KEr|RlhE!3JY+-kO_^P+Wpq9*va
zZs1-<(Yfd|dKD{KTD6gm>%XMC=l=i<LG!+7{?BRC_;L9R*FOfvx@fV{v7d5B{{UXM
z@8=NNqJz&1jC*9&QdYYrtG$txIXmXS+}>KmJ@Rw>%~evPQDg%tyI}X@{Bu{AA1s7?
z+qW_NIQ==QyH+-725^~d`kYZ9iK`5lwa7hNulfG~dZ@`eJhI4sRNOY7Z%@*ti*sqg
zM{~jS8%Amsk1R9Fr_2Yh`OQ_^Ng3MAo-zw#I3tzF>Hc$8%a1UCJK+BSpUSJeeqQFs
zIl%P>pN3fa4u2u*`qd#aPm&Ze0fHABvCwrqdXrFxSjxWF+(r~&<a+%*%|`N*At#kl
z!0t(>Mv*pVnL$<i#Pi3Xt5PAN$|08mINayvJ5&ID4o@W2xNPLNc591*$D+i)Fi&=4
z$u(TwNf!s@kmKe4=6!$K_U-kirOceFb?SH=dy;y7HKLK5MX4?A5)a;P+IeK$06*^@
zrl{OrX{d`Il@LI3sn9X?+B*JJ+lzxE6jEhua!<;D{d%9n6;Qm<lG0wP1JuTwPtXC+
z=4twZK{f5fl5DiHkryB9HW7jN;<IN<`D9`tcOySEj(~ny{&i+~qJ>FzX;}0Q!GAnf
zWb>`y^2_;4*uXb&@3>KN(8-O`#Ko3&UhJ{_x%31Ml`V|YHL-1gD)E&Z{VA~9TBCs!
z(+L-*@09-l<F%<Al1cK+Qbim9<0tSVniYy{)$r_Yu{%CKW6)KNLkfR{t94Q;eKItT
zG3DeQ>c{%jT9mvp?Ew4stB%JlS9fy^%3^;q7UX{jt$Q6&ZsRe^yI6N4IR60aS3h+t
zbPh*7xT=C_A(duEB0zJRSGLA?I{T|w?iHRRwsHwyn}^45f1&2N$=FC9Foqf9Zc(})
zr=?tjQCXlRl)C-rjZO&1b@v^rJxKkcE$3^>sg;#S`qcG5?FznYpxCJd<gUgeJqCIo
zV@l)#Km>Y&+Nk-NInO=Mtxx9psgL<+2OsSZPpwR3QX3<4Rhtad*PE_sBYd?}MKoHE
zfm0!>>e#74s35eamvA*y%d}#b%)-?Sw6EHoyCSK1+|-LEYdJd**)tl6n+G(k!x^g5
zNHA%}Wkj4wNMx#Beml|Ud)A6Zwk??ltvMvB23l2==9`#U)DnzTGRO$3hFnx;Vrw?S
z!b)<~%+ne2)P<^BGm-{8)77}B*nHFjgI;<(;$YP%HC!5yHxz=v?bfSEYG&rEN^wF!
zLg$)jYHPPO9{y?Aq>G4hMOX)lr4w;h194Rt*wQPgNNNL8zG?<Fp>60Ynvi?b5LBRX
zP|U?qPXdq%QPzMg$5W1#UOJdJy=FsT!lNRjV^I@KR}@r$)FG+rYCx28NWy?=tkp<p
zV^RY_G?dyCCsUf3bgHO%s{rvtMun%U<n*Q~!L2<#!$_*o2L#seSJ2TRwpKICFYeU+
z4<fsLGS#AmM5q<Ls!cM|B}F0)m8*cG01EAeK2&WKK_DP;ip{!W0mlc{v}A5iTBA5q
zoYYL{iOk<DnIHi9{t$VrY3$xrW?5P(2V9IF$JV-g+f|SftIw@rTw63paee{*F5if(
zS~?T#Wk&_#5w>{4ciM8UKAoy%x!4*r7zKKxkGg$EJ9~<_VU5>!1-lRie@?W`j7Ud`
z9CY2*oyU8PhIt)E03$pr4CClWU*}fQno+lQ)*Xqe%VHUu%Kh7SQb_#A^%Q1U=UC@a
zpW$^pgYW7+>4>%t8+$62Uf|WB0bQYp$jGX4A#`aMg1nRJr@!Y`ZCJgyW?!BxsUF|f
zkOHf#ql|`Ajmmi+l}@OWFb{=&*xon?>N|Z!F;x}UkKH~-b5pbhW&wFt8B^;(3^t7z
zbZqho!5s}tY~%$DzcP#!>$qo;{uMkdSxi#oE^w!zT1K~TGbDWKHjqA?F!k+8uBa2}
z#`#-f+O4=_gPx<Ppb(1~$-zBGPX7Sbr%kfLxEz(>spxs*^vL{aBPh`Dy}`iknoSKd
z2T-udfOC_^Ju0lL%Q!qPPCrlatv@Uk+n&7R^Zu2cJYXv-G5LWR_dn!Tb4A6Ow=;Qd
zh2v=J_!Cw%*ob)){lNMB2pInWKGjVS&od(rodFp3^!zJI1p?HL?o?!Y0LG@8ySa0A
zRxj?%2PcP@anGj|>wpSKI5-*hI6Xh{+M)K4NFd}nLG}LtIjsl(097;OoUqTf1#F$n
z8nMbq%{SXY8B)EEU-QK}Fv3OwiDg3n0FVGwHk;U7C<GQzgCpLoS&=o(%$fO45j`+D
zKl%-G*y(P>T3jzB^FGpkY>f8f{{XLAy1;2!RY^GA{(m7+=&@|HS=*ku&#5>g^35rf
zo5<D3BxfT%PfzEX(JMpL&Q9~cVUm1`$PNmRPi%jmtyPXds8sc1pRHJti$!o$?qy{I
z-{r}y%U$xwq!7wDB=>CK56u3RocdU}^)?qF5HFz^9{3pjD$Gy?W&j<i2TtVn{{TLw
zqlPWquR)G|g<DoTaJ*-bf30aBGdU5-7%#ggXgL9UpIV0C26x&HV;J=UuCFAM%7Os*
zIq6YKnUiB-x%q${%~3Zg8-0)}L<16V0q^hp>Qc#Sk)dtU;5g%f=*RjG#;<u%!yla{
z)-Cr`_xJjC_o`19S!CSBn}N{C=c&(NI%cgaYt)O}s9TRVP2xQ8RAuBoi~clypp*r`
z$MD<6c>GUu??lYfJFz2Z0020`gVWemSzWMIMgbd`^uQFZI}eiOix~0&?iZ)$RBo=t
zqkN<^%rTzGJx};k<u4RIX5X0o)$RD7=lND-w(eyjSNV_3?0Cnp^ri_AJ;)>(0dMZ1
z>V3^IB)YiDENdEeW;oB#cBlD)7#O44y_tqtsf^PfnbT<OPu@RYN}|!FWFm-wUvqa*
zD`{d^RgeyzhPhZK3fbF$d;0!0zo*=@kfnByN}Je;CUmw(<jT`--6YCP4m&VBkMSPW
zJ%VL&PBZC7p=zE}H+*qg!8;t)PG0)h#MulAk(DE%9@UuUugd;gyi(<%CnOF^pYIQU
z=UKNP#U|W>2+FYN!?6`D^)R^9nc<K+9zp!-+&6!{C=bd`<n}d~&q?NUQ_NM%QjM}4
za1SD)X>dByd7GD>$E{bH>8WZsi6s93Hc2@(Bob8?Vogm85!S8F<+f@oRH6P}wMkzh
zpTCMpkjK2)f29U)D!J>LYqkYiVA*6#?^DGtd)8QXCapk7BC@E=*`Upps&X;rtY<4#
z<Vwj%xH6QE!T6^ulir;qd8k())$_BV6M{`STWO?0>qGU;9OWV9vr#isOJbN?H2Hy!
z-OVXD6rJf9@rqY5k!6KP-kwb}_^Xa50z<&!f_b1(ift6ojCiV&w>4%?l}ba_nnYNJ
zLsi3Nsq(c(qZO)&q!r6Z_!T-nYLc}|uZpfyXB`VgsxmpI2hBWlQqhwfwK~)TgHtKu
zrHaI&lnk0`oYaV<)RM^*U{mu*1_dWMtjGpxVd9<Mn#QRh8J=oYH4l1lp`fu~9x07T
z4-_+Yp&_I$DBD8xr>M<S8$v4J)S&TABZ@$9YC#jl_rW7I*Jv?9#2=Bph~~K%n{xr4
zYpBwPlE7qc@9phflo_O6r)Z3*fD8(2N<C_<avh)JQM`t_B{QlqC6Y{<s_oX1rZ~k`
zl5bKg7b8@N=Vlql^Q_6F`HtmW)tifez4KK8^G4=is(No9<eG-fVpfrts8k2GFf&Z^
z3miNH*@q;3eX7|=iP^9g>`wsx9cfYvwK-6MJ2Y4YPxGcSDC3f0%u)s&(R28ZYMJcC
z#APN$+tfbw1%3MTq-$wJq1j1R<g1_LPkOTrq<hPBkv{e~C)|$z09v9*<Fc9}k|QZ>
zVZmkjz53I{E}>>vTmi$V=V|;pkN&k-R0;B98R!oq>FrVZIk@v9A!Gv*MtI2e6b7SA
zOsqs)dvw9;`qc4&>SJZY2Fj7f4ND&7KvjuV{uksCM2iX9Lb8H!xej*c)Ou3TE;D8(
zRmmH_fP;+vdsNX#>&U_9Di2Tp08!~omNL+lc@&U-`kn#pPs<ef^O5(!=O674>q_=x
z-3G*s_onTmk`GnE{$iKQ*(vA)3><U^+LYu4e1<B1W9q##O)a<(58cjwosZ%Csrd+G
zj8P7A(;(Fww%X-^8!^s5n5%`38ZrSOoO8x&I&5KhJ5LzT=~zvg!4XRjmwc0``6OJi
z2arJ={#ZYSCG3j~SRW@Ok)EF4(x;64jTdxcFnu~7>-mbz)pu-FAZ`dZeiS_-+AV6*
zWD9%dNUEo}Y<&(ZpqgE}Sz(Nk&*lFB)~-WMQ!LMr!Icl?&Obq2P1)O{x}1Z62ewcD
z0A7k)HY&+oBQhvdMDh+cw$u4%`I?3m@^ovt{ULpzae`Q{^{K7dTHVtiG>l0dh{s>^
z=}ftdu8k$XLo*`&Y(K3Q*5TxA>F7Sdha;-Drg-$MY1pmwu21@<BzENRKT=QOTd^oj
zJYh&L7y}2`Rt~bs3qqWSor}xQJ4Xt?i6`~+tD5&c2!p(skh4u?b>)q&Lu}8fP(P(E
zodJd=W&|uFEuN*Zz#l?IJ57;%#0qhT^A0+ZkMYG^*lg_NE%$~oo_XZ{ILSX;R3&wO
z#*NzOu^R*n3Xi%zZM`$ztSD2vc5+8<!=^t<p$Rd72Mlq$J-xqL(6-BD9FvfEqB`zO
zv?WMBbR+^7qK@o;z)=h-xZKD~MyKTf<9mOH^d9wUHCbGMFgeR-pzeRCy(G$BKoA8)
zUzE2X4_|VB!nRAPi+u|7G9B{F127rdIKciD&u{5f^4?QTC<V6gsxSxB*j138a*c@l
zbMlf$C%Hf8pQSc(trCC>5AQeeAE$riLj-0?6M12njnBzaK*{Z&YO4*C&gC}~yOv?;
z_5Qu9UG1Jp_V?=Lx{q)1sOQ+Bp^898aK!%r5znun^rVK_iF(S9h}Uxux=#v#cl<kd
z&1TAy5JY>IcUIey$NVd!dqwiemn05wJJxaYT%VUEjsgx90QEi3wF6AXOGW^&Cx3UC
z?mv*=RLg8v9%fzh(Nt&r1!(=5iw(*Uo`m)Kf!pe7t8EzJGTq5yNorwpIK!4X*n{1-
zAJVln>vRMkynUAyEOy`!Ku|uFqi(1$Lvun`6K0*D+Moei4Fb&;4^vt2KTKAPQX1M8
zFrKG3s_3VdMF;6zoXA#2w37n|DBYF4K8C$r&^%;U3*tKsk`s<P<k421oXF>rIVZ{z
z7!XGU9@NEPYUR{sL=tjXe6Pn}LsWJZAZEFe1+ge>(mHOW1DccqX^lv%(fd%VgT-0c
zNv5)5nk_O&`cj&SSoEipni~c<42q4#NDmYh$e~Dr6kcl53{w?&sZ>&sNfRN+rpeNp
z4h=i4W`!NdcGOP0TQy{WDhZDqRU@L~RM1XpBG?rhN>-hR7D&ex7;svnk;xRu$sEvK
z%OuhRkxVa24-^>4vRaco(+g0i*12MECQ7v^=}{$2I#sgBWEE0n;MLX~)GnE(g<~<I
zC9~S2BNd|2;8j^7^sP~>6^fwYs&zT2wh-LZs;9MEdKyXSJrt8x6>untI#k;+SvFwE
zg7Z>=PDts+aZzzxatgAGP(&)qQe?5)c&M0Ttyt|)$yt&_d8V3CN>??kQ9}SUy(!^w
zNW;AnLK8gF5k@E{6p|_^a+;ZVs1l?N?@mK#lygYf6oIJk(>|3cU5K}57elvmo`cig
zyDb(4C^H!czH6J%?H)^j>|76VUFEb$k%9oO`cbmEtRtx-C{@6!lH^sX8mY=#r%F<1
zR8@-3>N<l^^85}*etJ{gGs|#2eQGBl3=Wy-Dm_h68wCV}C6B9fkD#irZVaC@F=Ofh
z&013#enw3!fk+@m%lOh|BD_)T;2i$|vHt+pt4$OVVWf@t0}UTh^&RR6(OmF@IqZ3>
zs)a^;vIk?I{{USHgB*}VP%Li62<T2h{{RX>4$Nhlmlzr69^Xop4$uie!#zh@m4d$`
zBmJCpswAqrSLHcn9XRD_?F!4W*Co3UdHg+UK@el|oa3=LBAl@W$N++R<ls;QDhl#k
zqjqoe)})Yd7@;J01CPLpn8mzxInHtpb4?M1<P(o>qww^kG*tpbZZVF7>FJN@(xy+G
zfT~gZW2fQ%6h1d#g;VtYpW#grVRrz#eF5YD0IgAXa_p=;t474~q;u_1%B-YIAAA+`
z<FWke>XQ7BLym-Zs3cH|LCKA~4l!C=LNBSAcHl-Oi{%7hkMrr%@~TlvvdmPTUyywd
zU*}e?zD9UlfQ&eP-pBm=)@_(?JjBOuC3^B0bovlMtR<|GymX0sU}tOsRo9$$&-oRb
zs-7?j+{jm_Q(E(H3UPvR7&k%d{xzGbL|Ig+<$){p<DPx0q_idvQ#VX+^sRw#!aReI
zdfJvkEG0(gYK(nwYn-xU6M2MTnbd90Ff+#<mALW`vX^Gc?hJV923XY7*t)xvtw~)%
zNY|A?$K_MYGCk}u927WH$3j(!{-1~ySsaVhAH_6o#QJt0){%?HeQvos<jSrGr#${<
zn!1QRS-l?Rf&^yeNEz*c?f6z@@+6XdqG9(S-N5aS>P`pNu13JPQgAnQ_X8aMAJVC7
zVPb(3k{z}c9Y{bhzf#2oX#Nv8{6so?he>h6k@I_d56ITf+11uP-!M~_J(ub4$N9x`
z8hnJ@Fj;qDh-Oj4Hhy9~he7nNwlTI9L?CgT_Fmu5rDIF6rCk}8vhBD9u&(EY1BDf*
zYQ|>?9~mUE^d6?MH93|!#B2VroE2Vnf$RAGxE-rf%$Va_bRfuo)^or=-TXWI{U~1*
zg)fHYz+iU4ZUY{yTR8fk=qka881QhzsUTx_{{UK`g~V8i6MJCio~EaeJALv#fY(ay
zbFm-G35h;#+d}@Hn5wvUEXcv%yN-%IGu!Z}XSzi@R0cnw_Vp%|Nb(K`KU@LqDjb%@
za=phm3m(LjAnrovpRG1KR6&p!w`OCEew8{~p00r7pev9lk}?#MOJly@=qaLEPGYP!
zq79whn*-_wDH;em6C#XpgOYzrkhu+&j41x|a(@r-6p}0K+!S(oXW)G)+zJ;E+qOV*
zqqsjVe-WSNDx)lL@Gaag_lHCH)`XBdj1~ZQ!0S+$OdP?GM^-13`BHA<L?dY<F_1Cd
zrCh1V>FG@l3uZ+>R2clnPvukX!fsKO{_y_*JWzzmp_#wC_|}v%oD9}V=chGd7iIOT
zOqPwLlOnOT)k7XisruH03{Em?rmtr6!gkiGO`{n^;AbGpz>pO1YJ)~qh&@5YYTj79
z1eP^MRn8kdct2Xl)Xl3RxI$Y3tSopGM7cFN;8Zgt^&{8-sNWqbLe&c&nyDKmFrXaM
zbInLk%TN}<q3Slt17f5`$*2PZQ^d6*M7&dlCfZ(<TH|9}`cu)eDl^HUPdKP*#&1SQ
z-6{zeAc~tbgk)2TG*T$w=A(7ZT`}dU3^>hI%_7QzGl~p&t3f%<Bd-*g<jN9oY0BJD
zGMs}=2B{-CGSC?`{kRk;#XA{l<VPKWc&SyYJYtk)sg05WlSs=*q>6c9M-(7L4z*H2
zR^&MqRmU~FO`=Aq4M`jv=7`rOuEifTrpc2T99T6cms5!8+iM2Qn<3eOQI)BmiijMV
z7OY5maZ^S)s7D5-PAb_=4<{8AoYJ|)M9;-%ZAq});i>&<27YPi-K#`^oOGlkB7si>
zG?>U3VAN-mR%A?yghoeN5oi^0NuF^}>BTc{TPB7kJTb*AQkdopG5F9p9@T2sPd84x
z4h=XbeF8e2W{rFbC}j;@DoPnR7_1E&NBce+Ndq5B=t6~D0bTdd>4If>AFWqn4stlD
zrXA{|$U1ZQRt)GvWXL>Y=}`Ga+m!>{ig``^{KM-?p_jHt)~4m80u~&Azg*H4OksdN
zqMl!9<P+>EqF_!6F!awft~7!;;zPj#t*GPg4t-RLhCU7pZ9neSg;vH^R{C+xR7D{(
zw;&J+@6YQ(mdWP?bs+Ih$>c~6)|wRw7}|dkR7miw!*}XNX~75wE5YlGe=N|E?ci=+
z{2H7B4o(lgX{0NV1@5HeQZl*fFnvcuR!H5!VV^^h{V59ik_hY16gY=5QKMooFvHMx
zs1+U~xdVmzXCB6{H<(W+Bh*v`6=8?idnw21{{ZW#aV<z{vbQU=cR1{6Wnkdv?;qBe
z4UzXtXQBT9Jbn~Pn~<o^dhI{cHKJM)-lso&!Ltcv&+zB5J&&>ebwf=iS(zWH!32&8
z91Q(&kHW5M=^Mnv@Z&jePyV%M=oc}`B*<h1i3Pnw4*vj_GwY1kH`_XX#m<Gc$sPbX
zWd%?9{{TGJUZxJxH;@M603O^5*I=J13KW8J2W<EJ>nB()%7sS^FdH0>D`{+F4~XUx
zR!f%1A2V>?r2ha))s{jmnj^+T4^znh01xL`Vkc>%a!x~V3C;&>dsW%oODeLj%JHcD
zNF($h*E}z&(>q-gDk{vFW0wFD2VuebedAEw%v)5k;PEVnk9Q;dDp30+A#uWw$PvwD
zeW3pUY1xsS1Z;!b=LfHT->q!)iJH}$;zF0FbE_bH^am$D{bBjhs!Bhz(aSk)#~kD4
zIXV0RC+ZfcJ4)7zaJb&eFfoi0O*dMeQ5~!Z2}aAauVaosK2Ok6zT!&l5oxC0Nj9Qj
zZrTS-V0Nyy*>Mncz){ft6~*YPk-DG<EZA%g)!f={Sb=l{9ICP44%w)pRej1=Ht^Wn
zmk3Pn^A;VwG3nI%^Ts;VRwv6MJ-@zTh6~Yu+5Z3^%hs>zsuI`DXZ`6T7&tut057j!
zO3JZyR@$Uzxg2NM4$J)N=B&{WcTDMRV^xflBc7~#(`F<l#UQhP5$XQ`*QK{oP!?gH
z%tc5@X8ADZ`_gA0m1}JbZ>dQvLlEppQ^_FUnzbB201yxdrfF3qke2fcj@<E|U@B<g
zz{v;OoC?iFT(>9GV+U>q^&L8M{{Ysgj0)U3@5ew+YSRoKnUF7h9MbMpIZ)63e+smS
zxm<={yBI<0asGcQVFZk^+}*G`eJW>^x9|n`-ZNEM8-C+3!R^!eP{9i7fOC!B=Rc)W
zkT@SJaC(wOT~&1>c_e!e@u=h^gSWS(A(`xj+YcE0Y2q}5<xU&hnv-B8vuE+D(iFkM
zlkc2V+^bx$p(OjH1L@fPDn*QE?^b+!cV`FltI<ebgZI5pf2~q?C94;qJ4oiQ5xz6t
zv!m;d^`#yP;<Sr1k~wW>PiZp409Pq28#g9%-n}CJ-^`4ITy_1Qmnk7>NXl{0#8I}9
z+XAMNHd1#CRcQzVn!@^=$qg(3$*S)t&sv7!Vmclwy2x18h}joB&w67V^HG9vPgA>!
z3P@DRHF6`;t0xR9<aIJbXq<GY+*81&wK#~33QXat#+uzIk|siyr>`{%`qZjxGIk%f
zowp*Kqau`y3Rlz=tVYN&P%x`JnvK_*cQocR=3G^LRoT}fs%<`%ab;TQz6TVlywtmB
z#szYtiLx>&F-pYdl}CECE8I{ikKUxH<jyG!s?uSwX}wPrj=5@dXhGwpZ5C2w@g^$V
zOM_6wR*`BZ5@exxq##q&QZVAMGHj<WT8VfyETEc+M>V7oaVAyCser(wMZl+#RM~8=
zDdMWL8n-B_j;AyrNRPcrl#3o}RE&dG^b<t!jEb<S#X}NeuST2-$%!HlD>T=K9M$hE
z3S+MnBCKZ>=cOjfu<3(LBADz&u(9H{w8%!`nF5N;jB|n3wzRtmCVcUV(vs9^S)FBs
zXs9rA?^nne;;LEMn4<xXD%*UkU5{gUqMXMSRP4@s)v1np)_mC+EBV$e-4mDja&d~U
zBX8(&RZYVw%VhlxL~fc!z>FyMV^Jn)7?w`(+?M<*MP|lJ06ot&RzG>O>^Q5DgN?up
z`*3PeM6q%P&|DtjxT{FA4ns%n$3LxCiC7%5V1L>*bS_sUllXsHxRD|jGW6WP_thXI
z@Ji>~nwJVk2k{*$Bd`O4N%aScWRDpGzCRj~TqyaAkHDI&z$os_dm4?OV9bQ5?~_)9
z(V!Vv{pRPfrnpm(u6XyXnYD{sY=@I>J$U2ssx9K1GRMoAN$k>sqw}hAk5MQ~p}V{X
z$C!BPGyWo!qk)AP9-XTh{6++U9t(+EyIZi31N7aKQ^Bji$8n#_W1b{Nk3Ud<oh=wz
zOLF*-uMHTD&tuTjnVFY6dFg?Qt@fCuLApd`BO!xl_4KOp%3{FW%!BxW`=k6Q*f%qE
zl_2g7hhO2(U#Iw0T`x|GYp=D%Bc;NqZ>b-`N1++P9{&JJxv1@)0t1F8rs7BPs`@;E
z5<<a*sQANo2a)yX^{lM9?rx;{4eg*ZXOZ0JkSjM)h%_EsbAykl_N(gZ(DfMy8SH&3
zjrki{vy!6#AM@M#)ih-nsmMO`c!>mNL(gN*Kb2`d-!|NUaIq7C{{Ve+`Tn%qm=)e2
zqiFlX^CbTODv_af0RI5fE>1Iy4nL)51#6p1SFsFi8tLDHk%9_?{5d%OMyTDu!u|jO
z$C&DI=uX^zXEmiAs;rlClZAIx>Cg<E{$$f`;SaUg<7;o@u_WjDkIJ=b=#?umw=xHY
z$^dvPxRNu|j-Rb$Y8M}Bl1UV>F2xWa;Z%G5D#oDh&WxWTlwQ5C2|v!MM)8Rx2YMt+
z<)OeOh~zKRIXUT?=bo(E7Dm34=C0p4$vNGy@CTqjLDst4R%TGo_nQRcx1g?POoB93
z50o3XF&{C`I{yGKO6l#vW*%Nu7mdy7=s(7)lv$lwCdn@C8fMD24i8bBd*k}{tcW9q
z8MdZdpIq)I)YjeI)Nw${#|4ft)Sqk`#)jf{n6WGde(J73`mySNPfFTp8A43lg-AeI
zOS$zp1OEW6L6MG7I%hw1dB@>Y((7_<JKG(8V}bR>O1F?j56vWTuW|+l^ZvBfsEt!C
zqe+316nDwT{{UL2QyE}eV|)Jq`s)F;+ZeYoodW0Y$luHV0M%9_)*z7;i}1raJFsX;
z4cyxTGCn|`&YJvn4Ufkap1OoB(cxk24OvT=vGQ$Sp=JJ6S`(<OM^MSn(Ek8hZ=TEd
zV$Z%np>CDe<YaDo33L3#96zf48#S9L3Fimlml^at8jeB<IYtM+YLg7T3o!SjW(>Gs
zK<({FqdHa#_esxegT+#~XWO{T_+qrCaS_KSzCr3MF6QzR{0_u_bW}*RK^*6K<EMJm
zf?z<+VLZFOU`mfdDp;p0j2!)HV(d91q>aHN=I7tNX+YTmxyw1rumo1^yYMS`vlfeT
zAZ0a?djxw<R<;9iRj#916xB6yP9pv|n2<QEJBYEIb6tJrlo^Qvv!6+0#@fP3+`-dr
zoP@9vPDM5=a>s#PY0w@+_iDmu8wLXNO&Tg}<VCAD0ePpjvqU+rz-ZoD{oZO8(b`tY
zUMkAvIR%Z2;|8xH6&%+3=x26z@ln2(#rPZwhjBQW2RNpXde*yX#CE7&#fji?LUtpO
zvoO<0z{#u9T9q6vKwx9kP~F7h%A+LlQ^um4w(XvkVmnqns#_9+BTSRUP32@}t;1)5
zk}6XHCjeBB*t0jwz^K|1dsfof5_(kWrvubfyO$aa=9<x%bg2Hs-`zFW8x;eALs%$Y
zm1^edk;=CjsMyn()GXZADDn3KX+h$nCz=5@Rw$G;HTe{Rk*Ee&ag$R<pyHdiH8ev9
z6w|Pl<j9z-C^1b%sW#CgOI&yz=^LdvM-+ugtgJ(cT1Cx9iOpLnii>S(i;5hyH8*xC
z3sPk<jyqIrl_5OR<22$Zj6~+A5rImhigxT%lPQa|VyA_LRu4+AwC1xBp~fmUrmUG1
z7N!JJ4AccqYIijsTC`CmyhWrOS7oGI5gOoQ-nor9D3ESn{=Ifq4e|i0K(2~P+LaSb
ze5=;0%O(X#bB;|@o$;P(<j$C)wDSAbbhBiY-Ph8sOxftds7yj0qOxq+A*`ycmXsX-
z01j!NXk6d`F-;*@^cnT11Oaj-!1m2UMWRKx4fluSD%4w#%%G3>6;y$k$&I12+}5OO
zxEbf@aYRr_mLYi+0b`$R)S;we3lOW=0h3U9>kz;g_a>9=MYvp(>(Eu1*t)@S&g1x0
zPO6K%ug%zh^{R{A>;$vt9kOv$C)6f<u-->}bg6X`DN}5{G|r#J$^Pf{tX*2@#;utf
zrgPu$AO5Oi-OTPhypz*CaY3+ViOh#{9QDO0%b!BrRCQ;bYnsf~kUYx}`H#xd63zY<
zhZd!JmdkfN#%?9%0IQ9I{(5@nu>AcirSV;iM^9;H2g!u)a5|Ho6n%PqGhASm?qo7a
zjm^#gz#f2m4wcfO7{>M`&9|XtH4CrexVY<*M5G$i`$nf6YEto~1Z7c)Vt*0O>scUS
zG7mKy-Oq0m#}uxp!vt=}y(seD%uUGIvGK&Vb00R{kx$-e<Y&*m)<4MCWn<#FC6qw3
zDRF@Ni8w#qAoRz7f2DZ!i!xowbjOIkNP+$-I*-70{Jm)vo+(U<?622wJpOf3rk2FU
zT>GByHRzT<G7<pJ8FR{?>J2gr*vy5Mwy^^seRG_8`q!QKW5n+Bt)1%(kU~m&pJQEF
z-30rYxxos<qYk(};QnH=wxIMn$gcr;qX!3YU>E3rtw$uwG{{4Ldk5Q$*F9&bI+&E?
z$XE=4+ZgBbHKA`hNopKFAd$yjrz81QGBz(NDV+mjDuiR->q?-4p}&ST>EE1V^%Th5
zJ4G^_=Q;Y~ts)k=X&eoO0*rS%s8I&NZG&3KiSutevFV<1RIcIQZU(}qYXOgKr~GTC
zir5F=)HXjtYK`<NA1np|VUMV-B^?DB>T(E-Rhm$Ga8D$1SM?^Ng2ZllW!&4@<T>m!
z$JZwnywL5Dvw^+F3I70iW~oa4TE>HsjifiOFu%_=k0GzLELr9+owk-F@t%Mjb^ic9
zm8JcpakX0q9P^xig>usB2Vv!i3xSTLo&`a7sXHqwouR%^0psiXev~aXA?kEDdZb_#
z8xS`Hmcc*Q@~rgMp-_JA#Pu8xObX>DpD5QC3{o{8jz7=;0A9I0TV0y?`A`!Ly6^xz
zx4AUbT$s_`-~2$fv|@?@W&BC{dJ5;SJZEu!a*(D!+gzM|Pxx08C;B=wj57hBQJT=P
zz9^xwv@jU}4!wBvtyMK^h0=CL*wpXT86q&T&niPB{y^aVAC)#Q6nShK>raV*>d06V
z_2e3%bkR#YG-Zk$U>a<V<|79In2ZKJe?IjNFujXaBzgvk;)^FmF-ssr{4CpmKZwY#
zy5CTaKnTFy$YeOr(2DU53f<(AL>a&*73dmQ+HBxcCS-w+%zL5rALp9Nad3J)3(`vO
zbI>BVE`Dd;eOLm2I<S%|@JUnc#d2`!h9>g&41vmmKQZbLTGEG5-bPO!m1ob~9W$U?
zkN`Pj?~Kwtz-)|j^sMOT7#v`GQ;dL~G70U_)uT3vWt%I8WeR;y@)bhrV5bBr<BSTO
z{Ikgl8`LN{s2j;DRmdFiniwM<L!6dyp7k7e3BPPxn;rXBo7*h88wcf3EsvD>2NgCc
zvqHhoaa;D%aacC-F~G^iYukdV25Q!XGg%uVm6Q=x<9?jglsT;vGia$Q3V;an&MNJ~
z{JE@Y+@l;CndoaxCSL^yKq_QKJmRsE=LeHklK27Evgb8)%@Yc@Y78g`097@;2OQMN
zKI7J_oe<#1Gzv!oqHARXrCHt$E;CNzv7Xk7dsHuA2*xX4U>wsJQ?O`ZzM4HMvuR+S
zoL5D#R8jy1CV@pW5BoVl=}ucUQ_{3pQyD<(P|XxkPstrIQlttg%zY^VIp(J2@}hM`
z>?%PPYADW4T2`in)rJVb>qzOIf~Np=G#p}?Cz?G%%bKvlm=~JC9tt4<(uL-y37TIo
zj+LQEgS`Z1rZHlTqNR<ZB~MzIB>+^YxuhhCA?brtgv~Kmik>nkTACyqKop<~QfL(H
zQ$#YZ4MKWVw_U=kNSLOHv0cqZUWqa)A(BlnW4C0~sA-D4Q@T(nlWxT@y)$(tY1pyF
z=vgvqCp1P5X;?+nb5d@|sE~0_98$0WCWQOcq*BCyxaOKeH+0sCO63DLrFC}93UHf6
zU}-QQP;r{*)PREn71<SNnn4Ve@G6Y0oc60S=BUir2Njf=)f7{>BeAMYa8sZSk7{+y
zrzBQn%QU<rZ62Ad+>O#Lo0!~u+f@4sjhr#aEC-;fkyuN(ZgIEQoSLyDk=ud>N$*2p
zVnw;wK*`VhMy^A5BLR%#@fFTJ@M(bB<Y%sUtAB1g1~Lb`0aBRUO}Dy)01GL0_s{aD
zZWCdA!}Q1?ezlO7U<~j1h{vF$^GlvgKsQoY3LLIA8yg17S(tfp05QV!AXT{Z`2ylc
zEV%42b6NUz>>-GbNSqF&Fh9>Vx915HK3nz1Mn9qEw1gwMoaZH>O>E+fWt1*>z!*Q)
zl-nqTq+z)ne7O~1tcQg2zJP820QJ&{Sd+Ye-)zLzv5H$5%1xeg;|oJJmvd^X^93i4
zK|ZzS)&Na!f-{bz(!H<579$0NDb7LL85#co>(`1aO{ME|LozgI>~WEding1JE;^UI
zfD<Hz_RUe27HM`U%Q4yqsjc|qNfTi!mN^89%fEt4GnFIfQUUt)`c@I%<@p#bx9yT5
z5MqNUr~Ax#X8cd-igatcig%RG<N?V!HKA>4Vvo#`K74$De>&)t?nGhAjF}Y2F6AXy
z9FJpOf#LmE`%=L-ureUp!1V|G^IRq2z$Lgfr{S4e%K99!Rhxz${i$-^Hyhr^a$s+k
zNdExV;DSN*&-AO1lOrT*avAaO*R43h&uqa)BqL$K>T%cdH7daH%+3J$co;e72m1d2
zFM5MEYFo1xK%+l4Fg-E<0M@N&NGbw=c7S;4(?8a;t>f7d$>kJh9CpQPTSu?}y~fk+
z$NvCaRisNcAOp%3LjEGfu({4gJ#o!wqHO?n0~HkMxer{9mAj4Aj8v@7UhJrpf;cL~
z{$L;HpTe*;Yp*5U2nAPw-FfJLophRvUo+$dWys^d9Y^7c<!@p0%*wfPBT@kFa6$a|
zHO|(i>6s+DI5Hl0=j)D#`qLst@=`}5=I!m%HELKHBs`J{+A-<Wf1hfjt6Vjefs+Gv
zz@z=|TAN&FHLvPIYlCSEw$=OiJbf!C%i6zpCZn|*=56OKxRQNqXh5$9^%-UR!mX}d
z2APF*88+^Hvr}0_O}u-Cc&Vid6@dWmQYrSx8CEsN$^fS2?pQfrJ<BU^l^ahK=8UK~
z908tccwr!9gMrO9@f_`rCH>^2Dvx@`I?Bb^*6@v}ds${HoW2L=UV<z(TPE0x4?;hY
ztep=@`xT|7n2nA{nArV49{&K1b}btuDjp%XjynGUpU$+Fq~6CrCX!~iA+5=H@=)aT
z?lbz2#;QiL+qTmaf#`ExtWn0u3}7pEAnvMn)^cH=%ai`uw?CH^&nkD+)-`%F5)0;u
zx!f0_$m8{`IH!fvK48J>0jyidp=|#7-NESV)AOo=S)l+J!CuE5D?7NFSxD$E1X<jG
za5%vtsI310H&OWWRK?_F2xsJf4QJh6`AkBo+x_5r)n4TCSn5UDDgb3T_Ub9c%j1!X
z;$zh%5`3Y7&wAOm)FlHTfTy)PiNNXY<U!T{01D8K;X5#{Ij-g>f}ncU$S$08U`;-B
ziS}n)lY@?NR^pR5!Oe3v)5zE)*43<;12ie!icqpNiQK%_KA|4pm@R9m$f}oU#Z7Ix
zA}-A49!wKan}R#lo5TT+t!KB=xn`_(#oZSox;Yhk*7Q8GN6us0pTuUc)rhCVKbLy{
z0PV&P<VgOtnrw1N>tdaF$*W5!Ju91r@DeiBqYdJL*0hYwr2}7Zqpebo<99i!#GF>D
zMou;)ViZ+o1-_Lp-ZbsG6<4^nDKhQmp=PVHpv6^a@sfH}G-(!B$flE;M<aFx3XI)r
zJDQ3g9V$hX)P7zwnt)u!(#aa+w0Ozw%`HPh_u39|nw1}QXDjoXu^gQBp-3D~A%!#!
zY0P=07^z3jp}<suMk%XLrcoOTW-3+`&^H=s7h$ToH9DpR2)H#NIHb^+gNjvb8g6N&
zu|g^(kigRCny<f#a=v(`#@S>rsHFoGA*T_MOh$Z>^H3GRtMac}sLRa+whO?flTQ^J
z^`J?MjwvxfJYtjzcP`=;ry#+lq{tLpo0F;HotlAOqz02HaeQ%4f^Cj7(x856V;MbZ
zp`EUiDzXd;-BZU}<g{@&9hSOivKNzGN?IF0S&tQ&aCkMPCrZt^Qgd0z>4A?q<nzT+
z^J5Q;oc${F>N0rEQe^~?7n;Gdp&)69UKp|VtofvA81CSD=cQSe7TSJoze=RAgC-hK
z2Yi|?Qb=n6x%solUA0s^JF;aZhgN@-Rm+=D>Ohf*_7$3_ut|l4wmnTDrfEcrAlriD
zsl{kP8X)<$?mxmYRkY_bN1dBfIL9B(uDn}vcLm4rf-1RmGji6&_-05~a4WZ=1$q4V
zt8v}L;9y{S@%}Z6hTteMgd@IlpVFd{h}>DQ-|UXG?$E5B=Tsz*IpqQVB1RAEN`@&p
zGVlKC<AGdgx|J9on;%}adI_W`Tgiz|da&#M6;Z6VWXhJ8M?rTCbHTIBa0tN#xUL&n
z@Z2}?Nerl~F#D1Y59eKNwxMn$2;w!Aj;+W)n54V4iuJJ3#;yJ3;<o1Mh>g~a@!i)t
zEzyGdAbG;e&@lc$`&9Q+!E(Dm7$Z5Zt6uP(%&JUeOoWry<~i-3O5=xzFW|d)f;2Jh
z$Z-5;>5Nd}8EY3}(`ol0hT7aozdWz47n5zvoMC#L^Hzt2^&PGLoUAdD6*>Ist)GQ2
zuWq)ryrM4VLz2hX4k;*Ed(3ZFqA&*Q`Mk}i%Lli5x{D-taJIs}TaGb`^qn)nQzCD0
z?m*~!W80vl)O<Z00hPe{dSjZ?om$BWC4E`Frb3rC@HoZ`1O9#MMXUo#5f=yn+CQ28
zL-VRW7))BK1pqI;4r`-pb}1|LE&`5v`hFepQx|k&HD+W-0clF|WBbJ8(||u5R>VjZ
z!Ny4(aon1U-U5uf7a(ISPY3*u>sm2F0rRoA94Z6ew1jSFCt_d;L+wnJOAnXUtg7}H
z&$ThvBn8K){OhNoiO%Y_%_JwwVhQAW<X1jyf|Ybo2R%J92nX=UuF~!yyBl+p^DnXD
zvE2%$)+8`IoOR}~oK@~=7VL8umS8fs91P(1=LhsP#dy+hDdL12jpdskPBZz}rd~&w
zU5bu6;AXt5#c@66s8T*zNmUK`;8t<m+9>51m~I4%_fH15EN4uDa6uf@(Rg<Ct(ERA
zK#Y#R=cQJcN|wkMSHxgshXnEU6;o<^GUFDI+lxhz{_7a`r`cK%!*TD<Dzq0a#{dQ&
z?6Cg0t@}MUR=bHjojmQ&-*d<5SvXH()R7Fa`4POVM%3q%(zZM;qCLgu_NJnRVL(J`
z!>Q|xdY?|i>smU0gOUQb+O(oc17#sh4Dfkg!=C(~rFKx;wb_LQx<zXk{nScE2d;ho
zwA54;-sIl?gqm%y&esz-4&387z^!EkRC#TmsOwRMjBs8+20l=Fei)|QU114{6|>Ng
z*&f{u9;R<sp=TjfW);cwQ%rY^lA9N9IRt-P)*PD5YQ<zov)Jt*{{Tv7_KQk$9BQN8
zjb`SHP-V-%vkY%=8~wgNBV5+1>~rNS@y2){{x!1Kg~<dls<KHT$k`E7{obSN>r?R-
zpDD?-vJRQgq0KS;$-pvYaay-=6Cuj&Ur;(zVzqL9f1FhE*uG;jC`Rxy3H2haqDvpk
z#tmEk%SHL)-IM$|rx+PHagWC|IE_s*7()}uKSNq}-zyFQsNl6nAeSbzp|p$vf@?c5
zwp+GfK;sp$ZO~R^5QQKBO>00f2NiKfb7q81#X3M8X_1kNbMh;pY+=;H)Gfk-O6IU8
zkSQX%d&MUsHG_9(GDxhdF2`Lb+|Fx^bQKUK<qlf}D*!!v{{TvRz}fVw^2SwiSoIw%
z4Vk4$D8b9IVJ>#@n!63=I@V(ZjGPRSO%k?q(xIyw=+V-_eoso$g6DQJYmv9OImK$*
zTpW&QQfP@l>LZXfOdB<m1;#+;t_g<GgIy4v&S^w^%mq@AvsKJztw!Mi=M_&-PQ_$|
zV<wTCZvvey!m8j^Q2Whh?2Q%J>15j*HhK!7Zy(Cd+@4KOdck5?!R3yBDs&(vs*Gfk
zO=R8Y%45%Z$#x?7fvF>Nnq!lXN@_PWo`+rTb0l*{DpIt~rl~XLlNCDDpi>b@H9k11
zD20Y;SLLY!(iG=3(P@Iv)V&83#%u~15@IuAp)Ho7nuHlSsku>l5)PEB!KVeM1*y0;
zJsO{%T8kJo@N-s}l!nPrGS!9|6%iB(vb%9n9%^?L7Ac7_gRMPi3(Y(&OJLa2YBtEJ
zSW|Y;n3E$`Ak^Ebfhp>=+cdaltXr!Y1{RnUdRB#u5^n`insC*Wjm<o=Mh4s;O5C?s
zU<*|%3t0++%Zk}Zwy!nVqI5;*K_Y-FKI6S<OVX&wh0Xvbvyr?EYlPs|WUN)Nqb=A0
zT`O9qGa(rN06prQ^TltKf;G>tPyW4U$<@0XDRwZ$t;>SUqA}g1?N;E^?GTN}&5-v4
zfkR0&@AHv>dJdIhIT=RO{Do4CNtBsZ@l^p-U5Yxf>sgNg19t7J>PIK&D`HsZk+KF&
zQ<}zh<(RkV1xtt~Lu%0(Lgxq3m=^y4>sG{3B#ZK*f!vk<02<D;v1A0HP)X$N=ku*4
zgJNT0^(+rR{c34D1e(5sp58JK%vawaR&CX@FWr;PPeK95=gnD__8fQgZfYX$FIG$r
z+2iswuC^VznDgQ?N0eDlU4y6iQ>~^%I1);@=MTsp;-$FL_c3HKsq``xAC*^FriH#;
zqz-eOkL6P0*=k7Hj!`NuBu&5G3<~}$#a)MhIp5zX*&jj<4l54a4x@56bA>;xZCa=?
zgD;fn*LjZwenyD4f>vgNmh#8SnLWdDY6)!PowFny0gc$l<=(79k-ss_G2aq#{EbeL
zV*!G>J<E^eD_h@lEmp-8(rpL@qsURv@CVkdT3EpwtE6fMFuZlCj1f2=B9Wd0o+`qq
zg>uIS`^+#ZqLDIeCJ(m=pbu8y53O=q+)We`?Q{;vLh*nz>0M0Hhb*eH9CAP$oc{nX
z=Um3VjDRx*M<?YC^JD)2uSDf-Q0XhX9LI)rTZnFan}oxR<Yy-!<AL70=w|bQX;(Pv
zy;O0}<^DD24HuQFd4Yz~!*em<XE@2PL(yDa+AM>iJ4wa>$7KWFrkib=){08y#|<e`
zRN=ZH<B#e80O}p<V%`v{tj?r?fUD3`Ev-~dxULU>{=I6Vxg4lb=nZz$gT2lf$!t|H
zjiii#Mm|;P{uN$GnTX6l0pE5AAJ?^Oh{kh_nubXBp#T7Vx>n5@+{cm}kD0?292{~*
zVBgwHzk0D?pEd_OhI#yPkF9mmh6(d-+wI3nrz~;1zXT41<A7^9#`+YcV};Z&QV3li
z10D0uJw<R@d+GY7vQhKr$1j+J2MwHeuA|0DEO#>72T0>Qu1ExDx9V$~()0<&qXpf(
z?Gh`aH_P)f;1a*gR}+^jcRS$=#^<P8>5w&=$08%654^RFci~89-jBCyAD52R(AxQt
zMZB<R*l^%rA96b%TCj=omCCUrIQ04-N_sV^joNx1dOjaoLn6r=@z)(k<y%(TNsN5c
zZQIZV?Oh|=xKpqJ><<+}Xj?x!V2+BTH7&K&(MseIX|N0vnKArbe@@k8m5l9obWVVP
zf2C8{p-W}=XNLf&AC_u2ymVYFtfX`Ij|bP9j&z60*=~E|Z^B;64nF9{e!n+0LTy&<
zl<tvFAI22%?nO|V_1I(?r6lB&$fzL1&Ige3pYLp758^5vM$1r0oxJOpAL(!-9&sCU
z`kJ#In%omM?_JGJiZqWPh}WKTCTec`g91K-0+oW~av*`fz4XDV&e_f)LG8#jJ;afy
z&fa}RQI~rRuHRFUO<fbC7jOstTYG|YR$#-oF_1fPP{k2oH*O!TU5z#<JMqtTtr9bA
zl-QY<jIYwE%ce+;#F69uyo$Qlep2wvarsqv^$8@9f-(u{+=^|*Ek_P+gQC-?z$0n@
z0Ca;_qPCGf@IRGFZ45GC`C=pLr;5;yCya3Am)A9)B6}J+GB_dw9AMSx(S;&29{&KP
zSw?R7{Hl^hQeeG0Q;8VsPvnY)_jW&qD^x)Y=owB0VqM0{PXG^dT9Y_A1&I7>F6u?;
z)}nSdr*kG?xjAw46;Xng3NlGJ8LP9EBZ19Om0NIO)EtWEZ0K?3i%dz_#6s;Ez~t3e
zl#|U_yKguNSAn^Q_5QU<hB}(fu)|B5nq<*Qx20W%=z=RKG0z6Az@?Z}GohoPww$&(
zt&4XU9<_;M9x<BeEmd$ZD@tcQOA~Hva4Hvsu*v4H1ny{L7~;B8Ir*$($!t$2ii>Ge
zfm*U6fyt^9l~rc!22DdqwJzpu%Wj>IWy1deQ&$moz^Xdqn#@egARpdr{(RN*BZFM@
z+H!I^@ZR*LF|tN_lTo69-iLBeCYWRk(wwY!#KlpQOShVGigi)*19tOLfyFaC(}PnT
z#oJ9`QZHIY9MN*FLPlyC22EPqQMzK2V%)2?o{D`cK$$f#kBXau8jI^s#wp=2DH@%I
z*up7>pPEk8$he^7Q(0*`Q-SPgY%V~{QI)F>DiS%Ra7=2gDHPx_N<~e<vMf`!(t*u8
z6z&LUT5^C-N^lvZAc82k`HOb;rbh%<Tcz7ZiZTXkl(Uumzy)=>Xpn$NuG~6C(04YI
z0ALDJlSrHnY5DZ7rxT$vyT`^&Rh9OPD65ilRV4vOtzkV4Vr5*Ha(5cfkvZozr#?wF
zLS8<#&i6W_u@Y@&2(je*R-`s*vo3b~R#3Z&*RL(tinSi0B*<GHT6&(QwTo6Wnr4ZL
z75k$<K~TvK@$%-VwxGQ?6N;@iv=Czfi0ztANJ<R4@Hju~a0gy<QF+Rz3QUK)Fc=k;
zHmqk)-V&;MDxRjKhG$h*3aC9v$sa-6>q3&~Hx{=o&lF?tGJ=2E#b&cd9Dr<<kF0G|
zNpa>j)hbEu&5~-Ywz9ug0DUTF<T&}4qkDEhyJtXpi6c||#YZW%Kv!%|2=v-V<Z6<~
zD&<T3qp%sr>zbKW-;hBAkC<?Otxq#5%ehuTvQ75QBre~=*dYG^Lsx9?rjA0=M$td_
z&|y#4wKd^T2;9gUu=&U2aZ*}AavKP+%Nq6Cb~qnHRFiB?*&A?3v#w-;+!2z&Px9up
z;fZ!`jvSyF#=>fafg8fKyAM#TtAqJ-{HtA20bQ~#@Oc6?{zQXaF{Zkl@|(E$6gNwD
z8S1H({(}^$19`+nZWQO1FO&UG@~bgMq=3;kKAFemnz<U|jqDe$G5-M9t$mfu{i8)A
z45=FgK)#0^{{SEUy>mL9gtsy=Wf%i2$G@<y<I4;VLw!iaLe?nc78wBK1KiY9px%hO
zb*%1rn`lxA0(l|YM&>?;r>X2X$I#btq-pWmTgD<kBCZD{n$SxnfHSrO9+>Js&-JG%
z-^X0{9+gz6sOrM4XFU--ah;%U<Iq(5Mmy7D91QS%>BQqAx{^7=oM-f=mCjGSJ3RZ*
zkVbpZ12$ym12szKA^{;XGti#3d>kAaWXrt>9B03!4GQ|6apL<p<iE5GjmRv{6FJ&C
zdUMwta6Re^3rL_bvSbBZNw{YVjA!uvRo%QX%yvqFn>jcnbgEHksSw=~obK*>dRHA<
zO}m|x>ZKbpyf8}`3=hr@9Y=HOJ!)Wqn`kI|l_Lxfw|dhSVPe4<0AL(;^dhQHY~U3H
zNHf4;mS3l>OQ&Q?nrx!#th}qE^dLBI(}Psaq-I>ROSE(*7ay%^q;CHJD?2_qf(Qff
zAk;S!TEvGUHa*E80r?7_W|}YBTSE%-WMHUeBeL`Qj8+}Z&&*4bn|KPqbNbfR_3e{x
z21Ll`<Qwt$=DGHk2@82~?iu+=kFVxUVNwYj!dD^_yD=^z+wGD7BSGB0e+sn?(1i#O
zFl=<{KOSfd1YvW4e)b~+^{HCbCs4-YVCj`l{=G$?LAJV(O|3{ekO1j~$)x*hTmV{L
zdw%Q+c(I8IBIy|WuQde52p21~x3cD)rPP+!LgJuh94mU{QZg<|v<K>=kLOv3S#$s@
zC)aIHYpq8q#DD|rF;{d$i`>zOsN)1xo#>+y9zAnVE5|fs#OVBTGlN-^T{XLvP)0ou
z_|w=*H!s|^yn_hjei_X^($NkNB-KlMnP*{<f$jxt+Q%9GK7O^Fnv*q4l@_3YE-L(J
zM@o(efWhGV=C2?CbBd&l<i;sdaLa@0Ds>q5r!EM^9#%O9tcC%xO>zjQ5&iBDtwAVp
z&}3Di6mHNrLs*wsLPkw%-8lq;F<94akVqWXRSueE2^>{dk%cA9?1b=YnZHcn^{!^l
zsG@bx*;xnhoc{nytm%`<s}}P4ti)p*NamOqc*zwDvCoC~d7;^nB8}Wug|OzTh`=>w
z62OcKmX6DtwqvI?w{E};3g_(|xZu{bEax0m!Pw`hu4@TY4z(N3PYX<n7CmZV2wt_&
zn`b_Vr6F>1O=oHn6^KXO&OZv;byM3l&*~C+my7`7F|>V6Wm)pUvnuoDru@cQM$a)S
zKIp4bLm=jxVB#nXW0pUlsaof8<W$j$P%jM^8fd2Rw^N){TY_~JZd;Lz5mjWB9Agwo
zBd#pWLr$iFiW#V9%>gv3Y9e{3FG>><Wa!+~olQWdrc=cY*ro(h70o{x6y;tkE@2`i
zOjLn!PB;}CJr@SWW5qy8r7cmGtrO5qifGL=nkk5Ob3hb;Q^2WVVXnCq5ngJ+=~1w$
zxDn!tQ%VH_8BPL_ie8mKE-^sh@k*-Ba6dY+8l-q6^Xh+~tBhM6i+XfTxEU9(LEP6{
zW}qM#$F*}-R}H)cUcLL8=<PQZ73k5jS{)S8rmMzDr)RZ7q=D9?L5@vm-PzKWkl8pD
zN^FzTt;*F(QciJL)HI1sPzFz0&7Zl(D^1fqd8!vdRCD+W=aM&rG21?FasL3<soFvl
zF5PL8FiuZX*i}hx8Oa;S1EQ#?S{l7XmUIWH`eLg{;fKq)v)cetwC|2sf%F{HWVsmH
zpnXTnOO=nB#aEU{<PYVHk8GdA{A#TC^Bz9NC2q^+F#blYO+MmDU_Px?9##VjFF)^M
zsb~$S(4GxK^p(x6sxdqT-+}e#n!LKCgaGizjB|x6lkM|o6^<qHqcTY*UHBPefGJ?P
zNgw77Cq0Q3uIS0Gr$G*|TjlZzP&$&wAC785TI7QAvS9ESk^1vl@d#wb5p(WIKhCBR
zN~aL{N>5^o{c3E*bZCn!jla2V*d0{l{)hSsk|{3Xak@2Q{@hqUf#Wq+*3$7xma~RI
z{pDs-e=qQ@Yc#_y=q_MAep!nt{KaVq>vTk<tZCY}nHsdB9!K10@t@Ba{3~wOD9mAe
zwm9Gnar)I8aUQ@-vc|`|XCI|#!^(m7g(v-x!~83@3%ScCT8bc}0XX!+{{TT#BF3C5
zs{I8(n~w?!^(~%%TC%D}0So-ARA+JFPf^FzRDfr%CXqlLs!ZTx`eK-dI2(F#RLUyw
zIu0u7IrOTMow*D15zZ(YB#Dp84l(WhY2^H)j`-%Hjkp-e_Z-x!+q>{-j8MgUWKo`&
z<Qk1+`95QVj!6{$Ks_;v02`79diqmj&dzwxr640ahBKd9S0}A8GOUpnWdX7Q&rD{l
zMhXqQVDzZaWx@2TN(cDnronN$K^Y^GIjXZqw>jK$I)E}i1J<puP<;<-ts!ijFvsVL
znJosHP?-q%2I2nFWBH1uaD<~0k;y0UsrgiXA!|xhSrcg}Pq%8$xsG6jBE+gYm1g|;
zsBtMs=CueQmOw62Hh#Mz5-|RPs5}lMD->qlA^qWiKRzpF?&HZ|B)E}D>aj!e*E^}r
z1c!%tTy;=b5%mCpT$K_!VA~@r&leC%dSs|y%ZyY^FJf$?06P8PL8=kYEMw%2K_{e{
zKdIub#XRHYHvRc3I(}8pYn$G~n(E~9Ap_sH@~oXg?b%P3L;8T3tMc6!f89t>{{VHc
zr}L`LWRPLqhtTG!CPy+bHkNJ1IRVd5D%=8C$Y5Bn(zOMwilBj!*NmD$f%26fT9*?=
zG4R|Cxfuj?+t#f4o^SyC$KI<df__EM(W^$%4dfODPoWhW>{Zb<q**vCk9yg+w3h&?
z<WkyM0s#PiCbeR>WnMV?RYo1m<uW^XnQ^=kR@DwOx}2q#hU8R=z>WoIk&|~L8<g>a
zJ*q{U1D+{a1~}rQJC}|Ksu4Ee<F!_qunAVd6c*ZeHCE-e<=FhBe8c+FPTPafrFSYy
z3OU-{jbPl$@Ud6qdivG7=O7#$spxAKTa4ru8SV{bHks2)RdhO8quZPVRL_uKB?Tia
za&WD+y$RscO}j7Nu6E9-p}~NaJeCX2NXB!Dvm|Pgx|56^K9y21sKJ;Lpz~K{i_c}1
zKnEL09@Q7soUv*cmY>=}AWZblU4;V>cGPfN8qkUrRpii>&Zi>G1r&4@r38TE7_7uZ
zQPi5d0Ry0`S{X}ISsdb=Mpvy=S#Ubj{?MRuYh=tC9L=y2M*w8kEj*ju-}}s3x34cF
z`#ghi+*I_gcwwbLdJ;4JYnu}8)KSNYyOM#k9k`C#A^br`ezhAajw!NSGv^2Js_nd<
zwWaK=k4iDJu=x}aamlJ|LLZn?$R!6M_^70EMh{w+Xo|1FsE=B(SFJ$es>jT<Ct7e6
zgQYoYY)V9mIH?694l-$b)JUrmuQc<(rXiu;vq%OzR4m+5$n^=yrneP^CZjFQDa}TD
z*2xvhKXpA#L7IA+VPeA5#Y8FK@k+vDjYdT&ZYiYHIFcb0P)z{PkXQ(%3*7K2xyL4&
z%v-6ZwA@6EARI6M09uhrEs|<6Ql@i^R;k>yjcq-~;l*_pvJIlSdlbnP(^;kzU~^rx
zo3YbP7u4i&QUifaRUmUpYik>j$(H(6ck}|SFgWI`NZWq3gpp~L<-#vLYcAk!B%0D-
zTO^(;q_O8Gpsbp?q+N_jlLVgiojbbloO{;n&~@UdOKYA;{3||IYNU>SM3Av<z*T8u
zY<>17wx)n5Ey?t%uLB>Lt~~`DrE?hp92_s>nt6fSt~2UuL2a3jPpw#30*r0RrlL!z
zVZD)d$wKGy6+Bvb7%zao-R)ZeG1Wj6;U?3F)1OmRp<-P^D@dh6Jh*`azRo{0TTuDf
z`JGefNA#@afBER!kEL6NQ=G1G>~UJhLn%9#gGldzb0&QOAIh~XCcO*><SYK#;QnH=
z<C<K8B@5dNT2|6$hGI{t>-h@W4^t^@Y$UQV{{T{l)PtIxWgz_9RDYjSRAP-A7!99N
z4O&Q}L6%oz?l}Exp&2sbKH-HU=nj9CSd2F8<d4G@RU`ug?t}0C6=fI^@{k9wTG0|&
z4bM5vJ9ao3s=_R=VpyKTH8$`UAJ&+KcK~%Col?4+V=v0PTY^d8eQH^N*|-)S+mAJq
z75@OAJ6p?zXwM)LPBX;~l@w@2GRCMk<%rJ}WIrbZ4Owq+nJx(GdG$Z%G@|}8+XG<Z
z1N{0@F5Qh?!;Ih@@I@ivfL<|yS&e*~NMZau`ucw?(`VG<NmVzE#EdtmPQL!s$+I-_
z6q0GKz^Uf|=cQvdw+EO|x~T&M5srHC?N<`+In(59D|7UrG!89pTY#uydC#?41fKb<
zNbixHd2%Zp^vTcV+O(mNIL1`{C?S^H1P+x1fCJFxl37e=j&s(aYmmbneL>AJ5zOi`
zHn1bt{{XL4C6JN#BxOFb59v}|OOSBOk^R^I02<4Zbn4m2?~s3$Ln#FC+DyMPu7BPD
zkHWI9ba+@2kh>@SGmtp{0KQKn@T>Dn9l~LVf6$NXSyMc4xdmk&gti59O(R&gWZhcY
z+qohs&O44jPPIl!EyCn1tLRLdf2Cc#xO0#bgWNH!8SN#KLdCKP?_BLAvAk~gCz9z<
z5)s&Y$_+vNr4hjLi1g(Clt*l;dh_qiOt#Um;YNKiQ0`UYh9;y{zE!~W=B=#Wcm<F*
z<Z(=n!!7cuU#B9pEv#HN9T(V&T9tG~6}p$miyq@5wyiDLsLvmcYHLeL7a3fN)`lNc
zIir^Phd5fnB4CFER+M?~!KkB9oRP&?R1M12Bxc>k9S<3%GJkfJzF)k!q}(ao1oH^%
z`P6S99MjvL)f1^8jC~CV2&8bsKdn`gP0Ce|IN;NsMumuF;P({-_;uWeBhrG>GUbR8
z@g7vyKYe7;Ny~0u;bGpjFXm{8bR-aZ8iL;T3-ZYL^FN7CM)$5di;?ND({x`l);&z9
z?YHrtrC5m`E(yr=G+kS@teF`xlfm?<VUx^WzcBqpW}Wss<25MrErpUKXdHpG?anI1
zFtjS81gr85N6GwYu*mByJDrNJVtDj4uc|mPe&t9^x&)ntJS^SCXhac*O1T}pYymY%
zT<mhOmlH@2O3;UXI%2LY5e|0L!X+SJ0al><nYopcGTa(_TLu^{im(#~hF*GBi5c9$
z)Zthq3XFMNx8iCGNDR|7hwiTf>rT|9-FV|Y4nHcpVHp~HXBaQ@etkbm=1ZQd?2amb
zysqT>5VDw$12r2G-npyJgbqzWRH(=wjcnuC^puK{BV&rEDP?X}uE@lm4M4*=$<0eN
zMP1pbo2@iTDOH0S-Nwl9oCgJ`W4$3@yii*;nVNwn{0vdLq1}_kNUBJr;u47}H4c3#
zm^q|wl@gAG@{BywPZVdGTvkz9f<sL+6(Lhewz`rnq6Qq(ldVWsj+BbL(9sPs<24}8
zDHK!4#R(GP$c-f{Q)*m7!-X{QOS+Md6)6tGN$p5EpwK$d(40r8=B-7M{5h)1YRVH;
z7h;k%Y?uZa#dKEFDuIr*%vpk3x@#B)139kxE!f%yf^L%swMnxbX;G9e0H>VeHPOmz
za<rW)la0;KHF{2Jv0PR&j-^GF**$+MhD6V~t1P6NX5UZ3v*jz8q|j%ns`9`O3N!Co
z{{Utu@8JjgU4JgM42ueI734o%qxzb!W*p`Owu%ns<kM42l5dzpD}&G~=QX|qATA?@
zGwK&O{7!15(it1)c_#LDA%7}8np=lEC=DV)LbPiiqP)rv<<_M-O{_!r_mM^F0W5zl
zwIMHppzU7a2GROeS3#PqMGEbaPX?Bth~&JLy`xlL%$)v}T(+Et0?Oh+{?*7I$!fzW
z83XgEjAK96r{pv$+-d3ebco6S0H%{O)tI$d@HXkzqaKi{{$i;ZfuUo?3QYx(yKk#c
z{1HPR`>25b0O&QPZ+(3!0VF5iAOZOST;-}WN&p3z^ilp5(Az^KL<bmu?q}xwt5`|1
zIV5XDmyAaHoO<9@kqIAqW@@J#lE%0yKNCaWkTAmo>x$@28J1*)<mn>?_UTL!It+QA
zDD>izE46;iGt;$Br5}@2gfcnUj0IvpomdE13;?Io9Moj42@XPztPLt$CU${<c*QUo
zFfpNKf2(fBD~i-^{JiRsqh>RX8ztlS9kJJ*wcJjz$^e&<*b2whETEd%)DOHdo%j`<
zOQm#aO{?m8$NVMo>Junc*r<8!o;^Evt1GQG(%bft&PQtD{AHm{q}(*t*Do625w*6Q
z)g24OlFGu;{{Rf<g_sO(_HXg7RZ>q|o~1a!t622-JW40E%1ki8G;V_+k6+4&H95J<
z<&JXM;l84{S?~6IirZJ(tXgV}gzj&UbCPPKTU#>T9VK*jGhIRk=4x)A;xTJ<_U|_E
z!zelv+XLxdZGGZ<tw!_gx@wh(b)2qA`VPXo9|~wQ&2pDlo2G1KOLbqRTdA&xHCV+*
zQ_%DaMsnAaW6pEvK?Alcr6Zx;$o*=al8CLsg4_&xkxrG0?Jh<S@hx;iW6hSVv$$>f
zmybi*p`U7@DT2eMYBKIu1ExDs8h31C)OM;7og^~c0vbm87_<FFXWUy`T72?}3H1sT
ze=}Vx7}ZD5`f*TB=*hK$x*t(c<8h{Qe`>WneTqqa_o=G4T9u>#@6q`6bN>L=tD2pZ
z02YolUilnXH8suLE&Gd^P!C5x%cXP4wYj26l1btV5J*i5#Gm$?Kgd)++mOq_@b;K9
z-w*yYtEpZ$6Mdd>{{T&`(Z5_~t;H-ZM*S|t`>L{kCaAspeZ^YECAfk{{{WCKoVoN8
z5B-|0!XaWeX;y>KhLisQimeD@v6BI;UO#x|Kh#yo?JUpTl4L%TAS3ezj#uO^Vzdbc
zL47t1dWj?-)~}>?rx>tB_YPX6iq9YF<cY_iY<`BWMQG!nEr+%<R-@!AkVdigf;IYv
z{{R}aoiZYe@EQKKGQtmR8dirKfz2lo+-kl!F`)kdfYLK<ISPY6?9+n`;GTUADcR7R
zW}&81WL|pobYH_XjPVYre*;M>75aMAVY;g*Bh*$+wYj3O@7*x@b5TCn`P|t50CzMp
z;N*0m9E={7De5w`{!5N!QJ+w2J>mczvqnMx0BaRu_Acsq^sHIr*iLirO}lh7itItC
zS<NbvfgkAO{A-<>%JwNlk=wX&(62SMGi39~HH&to8*-eA!fi9AntK}dlN-xzlf=sS
z!z&DCqH8}Oh6L>=gXvi|((RR97=|9Ex)?6EW1Xx==Ugw`dTesZPYEt%@V~&W5wX2`
zcOI1<HqFT6KhCr~=~3{=j;FamKb2R#!I2qMC?gzqtCgDOOnA1cQRYU)q85!<9-_3N
ziH|3xW@&OO+W=fE1Kzdc+&JKmP;**Ebv1<fsis&&6(XSZrXgPDmx0l%WJ@O61xF!O
zQq8~vG}8F*RJA$8c5lxu@`|o&E1Ajirp|75GfdvD+kX;kplkbB)nh@zjicx*8%~T*
zew&9142Ss-=UP(PqjQien-6<AcQV39-Ew_MC)5`|D9u7|%!`7D@~IlokGMIitrY0T
zI3l%oJJp40QNis{825C3De^L&9S&-^8yJ8{tl6S5M2)`{W;t<CGLYO=n4!oS>s^x6
z@}z~{#M8?Vy(-jppssOGdu%Tvp+v^TD1K-5tE-H0P{#%&>BU<S;**ICi10H|u{B}G
z6&Ax%;h<Q6o@znbn<Qj&PfkFrlywBP9k)FNCc{<{IH&o1bgs1>4IEX9yJ@>{YRODf
zHiBvL+?ygb>rdO7aF3dlM^vem$Wvsfj|Q*2dF@r+6O7W8Fm%ef=AXqhsRtCJrDEV_
z;5%;>6Py}(YK4wkmnbL_BUQ)=HAx3GZXrU3r6?O#Wj6tlT@|q#KnA(JJ`K(}=DMpp
zZ<J=cscc$SE<&RowJQvY6+(;(c@@%@f@N1)gr-ga$*XPv``?hLBx8(p9)h97E>~nw
za?PLiSNv(4+oAx2{hq&{T6qNfRBpmC#YL<;jFEy3tJKmYVLoT&y*(+3o%@g8Jw2*M
z8TmyoF_1_Mt)9Z6Xn^fnA~30eP`r8hK;+Qk7c*WWLB&GQ5D7J|&|rI14FgC%Rmk<E
z;ukTV9`zrR-FsH~(@_9xnSyun&~9V?6rO6^{{Y9dOBw$A0MGjHKLJaVxcQ98beqYO
zf0QDN{(4Gwuj|&O`#zg&e|bAyIQ|&&6n|6sid%yu=X}o6A6IPC;nD6cFn0NA*K>~F
zPQTKfmql>(BmU7~JgZm)N_!_6`s0)MlUmwjZX;6I8~Q4659QXW<4x1f6^+FGK2RLz
z+xSQRJZZ0}T=~0Lt*-ty0%V>20mnaw$||FExSqQmD2aMG!Sn-~VYXKLi};h$vQtnH
zFo)$50rLiJ%s#l_A8u;eYA)Q6Saccx01DReBFvskR7K>HPi5ovsFfHlN~lhsH%e8%
ziH|2O-_oA*#@{f;@9#P3{(Y*Ix(H+3zTl^5QR!1hDgrPtPXir)3aYHE3n?r+zCS@o
zK|!2j>rUWm<y3Ra9>TIUJC%hAoaC=0DL5X3{Pm?5jHl(xIUKR<D+^JcS!dhHT;-3o
zWnNzHh0|K<b3Q7!ZBoU@mORpOr*BVT@7q6LO7myZ8pia)Alg9cJAEtGO~VPxDyeLF
zVe!BxuHVCoz}NJ#smAwE<sK1?H}MX5`qv~|XKW^-&LZ+*=HstwNhDXbc=uy>2kB8y
z8!f<64svU6MDYBV_X_d`STo7*_)!(9**b|tx#1rVu6))|a@(*+KZSb6lWMRgLXnOu
z2TzG$i)e@jPfoSF9nf_cZNv-^c*ytv02=CqQq=Nl)N*Fh$jU}a^y4G1(9_I{#H4_A
z$ie2an)mlx8Fw#noDuyqRNCQ*>db)hz;ZsKwz@_ZxzK&8D6@`n?Nx4XgJ3F;#2zaw
zz>T&*O9RG39DntVU4=efx!e!%p1=Kn!j53E5BfvNl-R)b546<ANzrj4$cO!^pJE`-
z8+(J>{3+gWh-Nn1>IvZg07_P4F?9$ekpnF63wCgRT>JV~6tcWhn6(R4k-xk~$Na|!
z@-^4Zd1ojwOo<zwIXEAVI(=|!D&pO3*h3-P5&N;+#!7#{Bu<r_<JAo_Dmxo1fIN$n
z6Mu5yarrRg`BeV^*>@Jo8RBi7nR#E#aaPk!gpJ7*!F^$MnSY`D>cz4xx&6(|$bB|r
z^v!3=x|MRYF>zO7OmIb8MvhVQh@ATEDAgc>KFsKQ2N@rjt5FrmQ9<B}wHVq)(~qSB
z+>xAT9oL$2f({SoPSz1^U9o~s<5GjWt`Ff=V>mS^$mAT;xC-&B4D27BR*_TgkmKBQ
zR>S9~2hyW(G03TuQyOJFi1GWuk46TV@fZ1{+JD_OG>VwUYLuu`l{l;<t-3W#A9{V|
zBBC;(1UK`hJkzKlWjO3}R3^WjPtP1|J=gk*&A6>W$*s#Xk>!w1M`Kubm;~pLYU5w7
zK6_VCd!zpVjZ~E_E(Qg|p*^HdKQUQ4)ZUjW6`;2<CN8XYoQlt$LO}#qK`xgmQqf5i
zfO<YZol}=joq-Q*{{T=kS+w4#bSg#c&Us}5UU;p0X}73c^Vs@T8>=%5eDS_LN}7GI
z%0}Afnz7LNXB#te+VCU!Q}?S^*5KJQ$Ije1`gf{pYPWEJcyaez(xbN2&85K8uZ9)Q
z(8soW{VE|FY0&cJ2SK<$f&K=Diy)DnGJ9lJ^h(Rn8s#*pzRjqrIq1a?@~+4*1LYN^
zHF4o*`@UYo(42EpF4NwkErCq?Kta;CH?_+_MPcjmfYHP|DPxa*D?{yy+z@LIQn-#;
zoaZ=feE<~|TI6;)u`Q?;G@D}3%`dA2U%+$vnzYb5pS{g3jv%p#7~!$F{ZBOcBva7S
zrxc)(wI$00i1R9rlq`hjate-Hhe7g;imunW2O|cl=xnaZriaZ0g#xHOqh#dqPHWd6
zbmpLv@vw4EO+&G2=6XU4XBB!Ya5=?A8lOtP82M|i$CW8+J_P{a)U!l3^r%Q+K^0+}
zF@k#2anO~iSs@rS;TgwDa<B)Ql}N#(mf+%0nlcSFBP8=zWC4vc8(OKr5-ZCG9V)+<
zwzXYDEmDziYnjH(IY+_B#Z9v#r9%<mRh4Yk$>=8{tFI=Ssot!zb4U;^X$XoV5i?Q*
zspCM^DT51|)YUSPk0zs$Fe>p(4n<a<g*1kp%&9=!YO_cIsb@TzsCnd8PCAOolI{ka
ziMZ8e3|g<j82}nwg>s?k4;<BqA<JT)8s@KC#sM62O;Zb4)YC22Fc^%JTai91D%<5y
zJNj0X1ChIv=sSIDc=kltynvp6T5~qh$LCDlwQ_2-vX(d%t?WgvIouOb^6gPb;YKP*
zPs^HHj>1Xuj8#Vknyo)I3F4wah}Ch=dVGAgYBF=c>U}DBlq&rEdk#$`40ARq(Zzwe
z&I#?_qw@(kQG$3F>T1r_iGO*^e{g?Vv>~N850yYs)!(3~-S+VjCBNsRb^HfvSCF{N
zV`%6HN^-zS8?)2yj=%kH#(@-!Wt$2MG5krVeUC4-*zW$~4bAxWAB|a(SmPUaaor<5
zey#o$PI(!VV-XBpe6$>&!@tzhX>}U7tu?)&{n`hYpa^#y`?vH__|+?^<d-7qQmfa<
zi~9B-j!j1`!3ShOpy|gfe;?>69yn}(H%sPE)LVm$f4)agpy%mO<=CqU8c!^2VA*MI
zygbB__-^V?u&S+aux8&&j!!Z8X~_NK`zZbq^yl8Jzngfgad{_~ZoEdSxX-9X>G<^e
z(xBgN&Zq1)JsH=TCH4oYKadp~y}z$d^DES4g{s?@6Ed`lH#BN;5PG|I`gN^LpowLb
zzWU`<3}t<Nm-*Ivl($Hb+u&pydlAWYJsw|QVc*zP$W5{}#moJo?nlR{-{?JY`3{_(
zmmM?{)V1Z6Pc^<w{lx>5{sN=&<X{2G&m>ebU9g-+b0qoYdf|_+y%S$5DL~uL@i6Fp
zhqYDEtXlh7KonyGxE(%~AKTVe!NMTwPp7q1lI0?ZF#x*{k$t^SxAUy|WK!5>C!TRb
zoq<HyH*xJUXXVcs{wkw9s^g!gJCjspxN^Z40FHf>RXHwLFH`<C5}IWwmmkVGEJgt!
z0s7S|oJkJEJ3;A>L;nEQs`K24*ACbKVB__wH(Hv-FP5is^2hoLl-EM6%$xfNUsp%K
z`T<>30yq?7g#eOArEt%vIleOIAUIm-Z8a#^FC*^Y=lRpUnR2^I=SK^Llc+p!YDBro
z+Sm<_IjkkP4yvf#K2y)-`qYN)m<GrD-pAITv@&ov;kwx5Fv#zY{{V$bE)+K8D<Ynl
zKQQ`w)(Dx{dgre+h#bkC$DTnQ>XkxyO{cwUd5J@u^#FDHj{gA8kl)E6%x8jlBiGQ1
zq=?QybDVTFY86G!;s`xy^Ea@pTZ>V+!C6-$x#&mZO`hf2CV0qX95)?4qM&=xYE&>1
z#heTr5%fKew@RDr=_)PVoXVu}mKa9=0A+{ppV0eLO^Br(!yl4W^J4zvbzh;aTS=vY
z0<e*DY5Y*iGFSXQqN<A{F=m1`j#U2uS#El-y84PC-yDIJim>$Hikq|>nu%#?bNkyz
zFu!=;oP7^p=T2cfKnce1dKVjk`ZxaoUZRdjr6bH9TlSF$ABg_|3c9jQ5c%-v-^REh
zzP*p*&04b&3IuLID1E+R{xxbfZM<QJw<3WAQUcNZg!f#P{6|sxih9P;kXUosf#Ci$
z>;#Au_9}frsd}6^4e#EJ=aK>U?@s_Qz~}kX5gx_Q4TJAPD=$4g4{Ccyk;hR{g2~gN
zpkjG|@J0;=Onu{#>rlxc-G@wiGXDUDFtA~TaL3T_e=3j?;jvWA!#hqX(#x>yV}s~x
zI_g#AWCPop&PK6Bva*Kc;<K)g3<n_h6%@BT6?1}m8qb#Lmw@|Eq3Kvk#?gsZO-fsN
z3iTd_q>Ea!Wg>XOvv3ckWna&|fo0m&o@LxNkSmg<C^vIYtHNClv@CO>@+AdNp{QCZ
zqZv76^;2BUyv+mQT&-!tsLvnXZNjm1?eOShJRFv=WyWdFe7o7=K83dr)}Rt<$o~NA
z(slPm&+?~@O4-NH^1gbSUotl!qbc;-YF$dP-Q0y%y&Wd6^8HJe+JyfAc0~aH0JsH7
zsM*1*+Ie$u<L*d0p{l-YyK%ceQAnDD#MpG;RQqSt*Xxwzttgo>tkx2&7bo|t-}}SV
zS9Pb}toCtB+>U;txx3+Os@wC3=TY)W>Hh%Mr)avnLpRuO6d_IqKcBr+WgFcy8gqpy
zHl5S_4zqsl)Yit{l?j*2#!f-#M|x{YPaxn{_eXMRtDzclpYqRv-mp_M+{YhX?~lW&
zt$T)H1F?S9ou#9BvP+EZ8$tBXTE<T89QBtySi|OSn1ggQ#zFz8)?5%WYF{}>t&>Mp
zh#oAid8(4ymd##iS3i4-X#n!CK+%lwv}|w<LeaE?fNN=#<D43gXvf{DaV<!#8jdQ;
zfxs028(S4R9&6CC<J#=7A{%p6<~um6VSr3lb=;WmOw}YPZuMpnxK%iQam8MZ3XIhx
zO9o6FQy6BP%t@;9GhpJ7>^)x`R8rJI*f%{X>`4{RQEE(;gkB9<i<+iMQJS#?!xZCU
zELqw!Lm(#<EUQS)H)K~tU6D#9s3QPXqPa?^6u7HNAS+jW1knVG!K~|%^6^*akaJnn
zwP7b<MlH_ZO-BrDdezyZ1m==89fd-b*;la|Ktawb%AKQ;QYvIqT~0!POCEp>lUg}g
zZbgWQHGbcw?1fRmW7GctuUSaZ#Bk?(kKj|l`k&}OI<}I$gfkrAa0dVnKs!}NUCK`8
z%ri3`v4R)79;f^(UIz-G4ozWMu{?CGc!&T2S8-Uh(lb+A5w7Mmo3<`VtgS-$E=Fm&
zB(BDy-iVTGwevdFyLn5D3g+NuxtwD-&2-k0VFuAt2{|#;ii#y}l?$jGxxn=ARvQb`
zlj~I^%f1Fn6Wgh)mWD`ZW_H?xYW}D3t4k3nfq<i{9)gh}TXoF8Gyec^9-mW8camod
z)b>>!K7xg&;K%u@hY*507d=nY(vU04`w(aQ$FTaEdczzm=1$*4Q_1{lyw?pN+5koD
zi_m|Gqix0LS~Ecd1&{Z#58~*f=}|P#BDPp64!wu}0A8XBv5dc8?*9OdK{Sl`T>f1J
z4ptq92}Vgy-^6<U9+g$M$sj~lc^u#Y$tI^%XT))?=RF7YsO6Du<7lnhbFjpd4*tvN
zKb=Dv6UeqXSQtIm@k%)n`!B!0<xpE%tcB;FCM4=J%bvsWrHT^Fs-rB&fm&uHfntoO
zn&0xu5&Ud=vwM1b4@yaMdJX7QFd3O9x&^o3MFMbKb^vx?=laubCNWO93qAJ<8X(F;
z=oEA<`VV?8FCWinpl>?rPnG3g{RKS-ss2K#s4pkkZCy;P2$thNDlc#kW%X|T2OV2>
z{=cud5iCj(=5sM(KjZ4@PDv;JS;l*l(?8I272Vmja&W717id-cw#nHC`(CH|)b}&T
zVGKzMkjg)Bj5%EOm529_Vc1q|5E6XWUop2~5dQ!h{{VpR_)=}HKd<Zh>^0ncB*PQC
z{G_g5)l~Ff_nY(|T8qxYeWFk>Psp8*{{UEj^}T5>SUkQ@m`qWU^2z=7{wDr{smDLs
zZnsE2MH{yE3Of(w_||RPLXs@YEAPvaT#e38sj8-9yr1rhagfQEWAia2{c1BG?xgp?
z{eSw^E$UK4Qe3x_D*|~qC)=<1R%2f|bqB6%LRiAvk?+Z>MgYx~91cZGk)lY+y}6c2
znRj(K{LeqqsXUT>qA5!sm#Fo?z%{oup7v(iJ&jVD*A3GY`HHeKUM=E8>6XZ9d~V2h
zDLZfmK9zi1mWyuRn2hF^>{-Ca7|kwVu^qM4u|DF&@%h&3-DC#OC#_^9xl^@);<TfS
z<-qJHQX)vcvX$x3boy0+D8~HYcly;V70<A!jK^yz=W>#`>ygj$%~~^Ny502v^Nf#r
zciIKaYysp9@%-xrZWt=CC*>i59jfv(M$2&#$2^bjpHHEyQX!)3R}J@E2UEkNe-P|P
zzrAQ$yV)?77B6o65&XgT{&kx!(9z0^`>8sAu-(-AdR5ywwljRYk+^uzfAi~9o}{y&
zj3mn&#Q5zS4oV;H`;YcdBer@~pDk{9nF7s>?p|^|bNoZ~6_*$`<IISOA(M<ZL+pM3
z0R3XMZRSa1$t9CHB%hfNx&Hv!zdG4FkU=C35g8ncu?sQ`smG}54O=K0J+geu*}%d7
z06(FqS_X#%9E|nsX*k<d1j?%XugLwe{VHhWAnkBD>*#)>qeT3v&mzW*(hwC!OCG|h
zF%|-;&JI0^;+j;W4CGT|aOxdT1Y{B1denkUt_}zts<dD@HzPegX&HNpP6+uA;A$tF
zCk>EA0~tJO=zAK?lpa-ptIsOGD*d`vRrG*IyCWTltmITtPJ8*ICN|pYIxK^5{{RtH
zKGkJ?`=&BlPC6X#L;Y$SjY2GeCJuNfy=Tg*?Y=TP<E?YfZ0U>(a_P5DrJ^ZtpWcM-
z$Klqo?st61<B`~oYJKt}?^k4ye-fM;mQ7vWJYK;a(vCMc!;kn7=CaaWMyl(~=OmR{
zLXsb$s3eEy-lnwfEwr22aRuXAOnOIjK3w}a{<Q?!UG<7@mKHG|`DmXqAH-I3i$<}M
zWgxu#!||&rfSqX&gZGC@n-P)Hu{OC%L?yDP@Tp*l@-e}u?86eLBCK2P4UjS`ITxji
zw&aW~pPg8He2j+3rHrJAGQ88g<UzHma-)`|8gtzhS{xpqh^X#ueE3H=b^ES5ioEK<
zH$~LeX}Xpvj`7c?Dt(ep=Pa-{)(pR+>cv~^SKsgx>+UOJWif>)+mMWdT(#0Q@~cKx
zITMgb{#AcX)THvv7YntA1x;xyCApeaFzepf(zuZQowH8U{!1~>EOKg{)9zJO9ZgFR
zlmG^BIIPpu-iPlhxvs{LAf9SB7#SHf*umhMm}j0ityZU7(4iyX^dgZaVmLLR@Z@~z
z6neK62l_N;fLQzRC~$j?NedIm+yyO8cSw$0?mmnDG#NK81_JsH1xCjM1PW=`R=E_a
z!Lv?{s?^bfII60IHF{3RlG;p|HbJVFF&8y<G|8!jzV2~UkzB<KWGE`@Nrm83#jZIO
z9MKWps=;y$oYN!(jwoY;)|{#c&S-HOWmQsJ6>vlfaw(}8-JI2snL&!mak%tmeCdJ3
zSB=O#R*cc*Y6xKj^sKpT#bPyQYItV{nz*+9!jCKi$gPsl5*Uw#sgitgOk`fw6DbEH
zr7^TgE9JV?8<sh#+c>JSw_{xOJ>6L?q%}tM=ARs3RA{Wtgc+%U5#w*p-lw4Y4)ox!
zsA$Q2ye{Toh<l#@072fW+)e_Oh!s=Szlpx5`U;Xl&Py{APjgT-d%9MwX`ooFgy)LU
zwPVI}QAchECafz3#sx}?MH;rdi5%coyfP}UT-QGYjB&}uX$b<x2{od2MDEG1?)IVP
zx%u7hB*4#FyELA2U{#wN09+Os_WuAH##fs<BGgeXk*AWNo)@|Gu7hY*3I+$I7V&LR
z0O!yOk@hFv`H%NUPsX~T9#nJAT+D!>U|S(qyC3t-MYt*35G$Upe)0bR3X(=?4?lqP
zrWFzJS9FJ>e~<W6WQ;4E5_YF!)Ssa5PiYnfASeufhp+wgd;L08GE7m98+50u`g;nh
zNhboO%E4jtOwQm8v$3U*hWU9Tt9pGu#;b<pfO_?BU;edB9b%5A_bS7j(;|(hauWa!
zS&sonsi7k?DUsjHgd7*gUtl|$Cc0lJMG)B#<SuiQ>Q8e=n-`$zrBH?#G9y2I&sYBd
z0qs>;K*j+lpaZ$3S1rPf=Yfv(D@-iTBKhu*%eZs1*?mvFE8KRl^(&_?Hi>WF<q?6A
z9(x~C?Nuk5K<#e}8E$0HohUz=gX!vjr8Jg}+6YK%wg=6VJVZN;`ud;1RaxC)fuVyt
zTZepL$^HxN`OxKM{eM^dfg-fB?ai&BGD#=PB&s=l{{RaA0JHr5m87X2muM5^d06w@
zI}YFODgOYzo7ht<EmF?k?X!cq#v>WQ!Swe&y@x%;RGLScW|8rnZ2)i-dJkb!vb!7C
zLKfX|cb0NV%R9H~O-}O&t^Ck7mr}5=bx!T|JpQ#_La0n`NI<@1<aOKGex&2`9+g<!
zJ;Sul0Q6DNW1t?wmF?<Ph%RK=8z|tDgZ2La>wj9W9D*xXex^p)K7*6}DK7W}C$>-H
zP+XFt7>Dr23HrAu`HITrq)+GYxg&v-_|t{RSXqh2SSb9BP@Y003O$GEPeLSxx8y1i
z#Qt=yGTNUu5xb9X#*jyhckfje;hoPI2a2^6z^)1U(irl9;Yh|tK9x>b0SbLjO1NJK
zIpowkx149AP&8ML05(2)91%`1g~obzse%o>9&?&=sPgDSY@pHYa%wfY2`E#+Clt3c
z43MqPDulU_44oZ#`ubHNGgRFyp@8e05!_T~_iY@bqPF5coo6dH^#`cyT5+j(fIfu&
zw9_SoX|m=>NcnjJuG=@3N=g2$xNmQIh(<j0az65@Kg;@6u@_&?t!_glfQY7RdANzV
zW7lt^52x}qWg>W8)rZWyb*RyI;Nz7TKH&7N2$V?}Q~t3BAMG#rkL6k=av1wNri$mu
zDjGJ7Gb!)RD_$7P5(y(YL!GDk8pMrH+rS;5&;C7Ic8|3PZdCOY;`%a8OL8{lN6Xz$
zZ~nDe357d(Q`8!x8yH`%RP;4$N;d9Z-9>Dla#{caWas7Yp4EXp%j<fGx3?r&!gB6E
zkNo!h>c*viB=dcbPI#K>2x*~i>U}f89;2=*-MkRl+n*r<sXLuVWBwkMokZpCt-6bL
zOeGPJG4lFT%*&A3>&AbjXWdDJGA}%4vredE&mf+KTcP$fT*+u?qS?7D+fG5+PwP-U
z>`r!1>BA19vYPYl7VRyoGX?wcH!PpxJ?kO$WRr1aZOPXiYFzA6r*om6IienHG6^Ca
z<$ABwRz>~B$c*XN_|bYdzCWc$ZK%kYlHOd7xXnuU7I*4*e&%EMiVTD9RDUt~8qKu!
zG>lq$ob;}fatKm-2IxoEx8qHP_Ayhq9_ndz3mNRnIr-sp@=AwtpT?qD0a20&9>3>`
z<dyHK)~#&|22<asOw|dl0x?23Jq=o53gA{f{^bOWj2ff}uX+`*>Um#UWv%VylL;*D
zWQ3lfTZ8o-4I;_rsa8F@)h1?wU9I;?;MBL(7MDhJF!+Kw+ZXzD5}t-ge{}x<rw8(@
zdvux3=x<)#i2Jt@@&5pQEpsICymR!aqhTUsAHt$j{$+DTPqR0lMZ9J6Y|K|9sSwH=
zKZjbX7UI@V-oY$H_H-ht+o=)^WlyN$wc@zcQ~?$A=sSGmkI2+}kH{Y~9Tn&+(*~q#
z#Q+Hq5$#9YFTnlix1avHK)L+JDl@5D^p{V!2itJ|wE0&?PQ8iZl_ZnoVh3!~+9+np
zW%-ACsxK|~Wh~?TBx=*JzPFqR<h6}GIVO{9t0G*{_pPCr<>7DwaaNbhnL%^~$A0y%
zGysp1IDqVSXZh7eRtE)$`f*uJC9^T9QZ*xDrt*>n!vF?BZYxe7nh_wCI()z!pVG2#
z?0(Y;8N&~{$F*kZTBNo&Ceq{;aC&w>N~pBa%kR<mmd9Nc-LYV#GoOE!ucW+XHmLR$
zk|U1b2_eRNf!?1M=+6ftnv+jsz7vH=-5N}>$lo!;d(~-dh~!|3P2`Z+%}TTG`Fdy8
zuIgKrs@SR*X~Co_ry$jdQ9;2K7QhB^QsZKiC4q#|gG7-5&MP;=_vvt(3FKDQzyT}O
zRPL^M3LVO*!*Bwx$AYzvB?$Pdu-x&AOG52p@dFi3bHNoapY2trW+x)8G$zW7gttn5
z&;>je^rUUoPQ=}fP!^OsX9Ad8Ja?%fQd>Bzw`RApB@wfVdO{Ruy<3R}-juRN3e#~X
zn5q~Krl68k+;C`>2q&7)bZz;`snHdwb9G3hHBb@}#Y-Icp|m>^%r=V5k%4}4eT7-N
zy2{{HTXgd!1lOHjZc|Z5HAi=3TUgo93}AW+O^m0dXhj%Q6v>wdfnD^~k)&f}Qpc@1
zW6dr(=~9njRLe+bmw{2^X{nCttuqy>T8X<BW0Vi2Ip=v!)jU>m%3ZdKzikYX<gNlQ
z-*z}v`kERQ%eJY2IOo*%u7b`VEdx2&r@88Wg0P{xg4~6-9&3)WxZr=fztq*pZk}c&
zxn}N3q>_nB?B5fNVK^SW%|uwc1zu0JQdv(3qbz$=i69>;9Ov&hudw#2=D4{_pKyG<
zx$ZlCKgO#{;Q>|Ze)E4ydrBAOA9(fls_t<^VzCj)Y*cT+z&v+8wJt^~76ph331h$m
zl1*mHAl>uhwM@u~iEfDtjHE!}pWONnf69^FEJ-9Rw_crAKk+}HsuHuv#pGhn+_5K<
zOKrn)OPlwd$3Uy{qNu=8>?$U9cUY4wNx%SPlj%@n=g(nQ$u<lkjZ0ri!Q07*AK@SE
zAJe5x+_hzFTSogJFh+OXH_FVs?9cG8zp1EYb8Y*W8BNmzzCd`0cftN`_zFltfEZ#S
z?q7#>JShI^pXO^O{^VraKyrkSF~{+rPxnV}&Z)jt{eM^dgq6(;Mn;K}AN514k%J-i
z_8x=r>sBqTqLJR><l@#2Tm7TiN7k!Y#>N=oOmBYYpdHEf9co*dmU$wzha%b&^5=rb
zyB~8xdR>FP?1Z|wdzNYB7#$UVx+=0D3%&m8jmO{Gq?LopRetL6MZ+q_sDAD@ztDF7
z06w&vySoc$W!)100J|sVW9CYI`wze$)|nh<810PI=^H$I-<Ob*YH=E^WxFWJxPZ$#
z#y^OKs4h?!cmDa~>Dsk0Jl1~72_KbI5VX(FK}n4n%>ii(<Ei4GBs(JV$EHuYt#7k|
z&NvlLRLr1z(Q6L(Gkl|fGm2K=f;x4puNDpm(xP~<4J!`iBP%yK?@%Sf36DH~TDj$6
zdT~_@ILV-7zi4n1vBqg7yJw8x^c>@{rj#Xz(yKP*ke<hiVj-GM#CA0|MnZZH4Kd@+
z2pwuv1&Zg6X^2A!S@G#wa4+2d0CyD-j51{X<p3Y2Z}6*;$AUe8swI;uaEGokKU%O8
z<_3{h<Q|2O^r_Sg1MgI3#fWWidV#_B6=j1&%E);fgHbAsb5lxpLBZq>D@ce~Wgpt?
z<NZ;$7#~*Teif|~sP9~@t>Z^^aIP`}ZeOVCKQ4a?)Pn3tnM26!LTg`8Q8&_c!B#wX
z{{R}RsOpDGw(|xaMmUX6M*jfoRamaG*QPtxQ(cWx=)&qmEO8Sac6y(o?@Dz!SD!*B
zriRvyZ4>X+Audd$kRR_M@6UXmhtjsvZRC|0an*ml^#1?~$cE?aWg`G8fwOOJJ5<xm
zs1|6+^0#1-)-&%_Pu*XcN)}coxrm6GTXXWBN_v0wtCqeVWW|(VHv|#T*H8A6Ri})C
z>Hf7xMjzf4%xd>7J@zcow&QUFo;`hswM|=C)|{;DX0_GDw7z6+%OK=)gHLk~v=xCC
zx`-bujuKPoeGlkrIbCNsVS;^oA4<;+hUeD@+PNJ@wCHxIC$CZBdz)K_j#OypPO5v=
zw4Eja<M|t@Uw>+qW`_PGhWW6{F%`}^UqIfY`Bh1+X0n*9s4{s39IFpdXl?Gc4K{n4
zkX}fkmvB~Zyc}*xJ(zSqTCF{t7j8wiEpGZ7c^l+^_Za^GBl()IygG5g6y;@_KPq0_
zD<0n8O0_GiG8}EHDJ+sjBSA7UNQVrLFjk?wD(SU{2fw9Qp7cj5+iGa?#-j-`{{XFs
zdmrLoWB6jBy0>(VU0r5@_>ST}RlSdJeUCIXZBWy`=Ra{9+eyL79%>huIQh9#S0{Wi
zNd_xAXz&R`$E8CyiY!FoSFS}^NXQ}MJa?>^q$tDRt!P0pLJ0gtVI!g_*|cC-1{J?+
z3a}N4WD;BvTaY6r2&~rRR4tov-E&puu!=+XdHPkz<R_9U!jO2tHKB#<U`wVL&&$%F
z0pNP*rHi`NIhHbT>4Q?@i&O44$512Wen-D*xZ7mpR#=5YmR74NBd<!yr?JO~qZvEL
zMq5-?7;(p~Qq=S)ucG0C%D-|!)PMD=<TnWTSK+;C$yQ&PO*p$4My#p(l@-y+>DSj9
zb=kDL1zk_zN2so@FqccZG6VM>wNC!V4Nl+6kZhHCj#1T3WLR9>X*Y(}@L}_i-JE)Z
zT3+htoBKD<P3Ux=GMpX=ps?@Or-h=uj5s5HKRQ;3fr6ZlwMks=r&=>tL@PKa2Bc6h
zN|E%Yv;zgUV-&&)jIME2siudTXxiDa)Iz&_q~g6AJM_4PST=Gi#xFrhOAdh6&W++X
zg#=Q$!02n!r&CKDT&JUzQzmNMN&@D&IJK2^1WBL9t;ei!_o^i;u{fP8N~_kH_Jk*s
zSifnKRNi=B=T&CAW#K{1E@LFMI%SqJDme~%S3hrgH*jkDU6a=$t`6pkD0}&WO!ljm
z((}(1Rc2Q1D#9ouiq$|;Pob)lAYQb|Za`CmS&q)0wNhKA8Km8a<uA!`o+<_e^HdV(
zm#b+u;aJGhC3zU5nv2Xku~J+H&T-bNO(&8vY9GQY?ulh!IVO?`f;i@@2Qr-YqiX^y
zl8q<N%*-K8xm}$E7h#85oJz0Ox$yEXHr2Rpj^GHc_?yXSY6t^29qHadQ|(T-V9E_k
zA;R^oc~KW-ciN(}ZjS7LII9;DVDfM(DC9{F2+bC;A!?RzMjvdKGm)KyY>f}O>feS1
zEyR=DW6pn(e(Y*Cs{H_~D-b;fDbEnaODP$V$sv3W6jqg`;gV5r1{sGW_8z33!n3A=
zX{GX^A2H`SzyrA_uoW$;BnV)DorLB|2*H&907V^#;%aO=of^!hEvVap`_KLF@D(gm
z2++uVaHG`LI!?^%1bz6cZ7LT4ACWj;e|k^3N%S<LlK{IOO)=&K^{nZxil_pc1IcqJ
zjX5~UB;Y9SN$e_lx)saR)rneS6$V(+&oF^XK-qFI(*FQO^d9}`pJ|OC@}O*?`_It7
z_z!BV(g_Fv({A3R+T=TMGfX`3PnE`YgTSX+qR8zeoI@Dg=hx7EPx#h$zJ&^->`;q>
zGc*4Hs9a|qw`2Kx)n{+8hEUnNek6S7U_DfO`k%wrrn|XloXpr4E<Qyc^2Kq-(Br3|
z_cfad@|W!`_p^wB_3qz?^ZHdcZ4i>arKzR+G=FDL^{>R3{t^A(@%*ZQi!x;N6zGm$
zapxzkXj;OF9G8;-VTYoh`Do{&{X3ujy+iky_9QR%TX7%!wv|DD_7k%GPgD3+WS3^<
z9eY)FndOn4iDqM#C%L3z#ACH5AV`|szG7JQIW)49BxLuIvG@=Ezw)GtJVwXx6>DHk
zqx*~c(4K%pDOeD9W6)G%I1JrSYJ7-gpK<3L3Y0=eBvQB)1Vw|KaezNsks%1RKEfkD
z<bPTiZCYsoQf`<PTgvLhRnZx9npq>l^rLVy3T26?WnAET)#r?=liZrDS9H0g1Pu-&
zHBI8j&{o68i0Xu?oG+=O!%V92;Dg$!%7Y*eTGn*=YMiNtUt><fxmX}B1xhXUu;YqU
z7@YnBtgX}>ZdsHM;xRTqJX5g6Sn$UqhF)saE0zjJY*O2`y~hzsu2-lBkSf~1JitKW
zor$>%0*Jp;{VGWFw1eE#(?$ps#auVg(qt2-DhI7Z(x@d#zz2i%qDI*0z5f8inxm*V
zSfq43vWCad=B+ZJ8?wFNk{f`*hVQ%&uealhjdcMAs9@-r00ZgNe>$xl>AQ#)z+^c3
zj%v12OK6?j_p7-#W3D+Lp{!K(b~dLCWSO$pb2B_jPRAI}r!^!6W6LW5K*H4s7{_d8
zla|L{TFGmvkg<IH-G3U*PnvgoBPc`KP0OM>s5N1DuWcrL^ce&GdLZ=oAIs@euCmaC
zAw+=uj)$X;<NDVQE+uA3B4U>)`CNBl-ns)JC~g>RE&(b1(~dx`-CJwas+OjU(&~=|
z*n4+%K-r00^N)Jfg61O;xwZ!f=9(eZPI`}Q{{TNjT)a*mLh>f~*kPBxJ-rQCQ7)Y_
zG;-!9`I205>5@M}Ju~f58k*hz0K*dqTUl;t-`K}6`)#2GPt7E2+tr7+sO?xXj7(T{
z?kiS1#<ZB+!Wm#MiDMj?9RC0?dwPy3x}}=QAKLAJ0AZaJ<Su#<>ZkPSMB#l=(+S%~
zP>GbQDEz9`z2c;QWw}EovE%{wqdu$t6_{pBkKsMOl<zlp%J|6cD>(b6k6p|6R-yp1
z#~^qg?n=E6vGhLGSg{<`mUmKHNb!&)g~!bsbZ`E>SeDv0h}_ze@P87Yyob>L0F7Ho
z>WNO;GdAH??mr>^{{UFq)7rBv?rwDHoFP)J#E{4`k`HnHdWzGYEDgZbS)&(7W$7aD
z`0wvZ?6i>Ut6P0W-R86mHVnxVa`Air0JOgU0G)GF#`fnp%AWPP3p^Jp;o3J-m61*e
zHDd18EkJ(kM-AJ5%RZ6Jf7$G<?a=neN-U+e<4wnNmH{LOvsCICRAaSAEwe)_0;L!n
zGLeD}IUMH|gwfR(bZzO<V5=#vrqV$m12{DS<{KI0#o4~cuAa(pR1EW3Ey<{Ed0Ac$
zHJ?1@2jz%0ZV9n~d8c06$8e(~9`&j@x|HG7j&RN~p7mkxV3ZO-IIdGtw79pB`DB%<
zme(>XVc3E@nym`uI_79(A1T^tqbff2P_nvN6bC%|invoehRCd>mCkv+Nl}Sx92&Q8
za`F79k?&c8Go8xZ`c=4#0hT<9o>Oi}$~KXrr6lD@2hi0^>sart%rh@eRlA|}6(lo8
zK~@LrPmo}6cQ5KGG}fr|>fxnL-WwQpH`f|n(6_li?!RUg*FkW%k%FuP{{T9|xw1>h
zN=P>;=togm_V+g$J<7#!#iBoQPR5AN#t@TE=<4_FB^Z$1!Jv*#R+|*C>JctmY=CgW
znTeosk{pWNO6Q|WqMGwOuTj&$fI5z%xl9z8fO;DA3*EnEz~|-SxOR^h1oq8#LF!=_
zdzyA$V8$5NNv6Wm&Y_9gIW>!?sK_F^T_|k@o2Oc-SSI;Z&3(PIg%s#*QbsuyS1%wq
zJOP@0ozY2f=e;DfxS_G99fB`WQ#GV&xxl9_!W7`uE8E?Xk_|LALv)K6EgN9gYU`3)
zOSQ&8tlPatc_b>J;;MOa%5r<vskks_ZD*;$dY}dX`c(5;2U10G_O|lI>e1vH>FsYM
zjxCdpeQRjSGeWiYDwu>)b5<jj7oICKRKJZ$!*f(^^$qR@>cVX&V!3Q|PcpibSl2Qc
zWhyg)TNX%@3^D6c-02F6TzzUR<*_X$W?RMb4At1})s1JHpCVv{a1Cl(OtOrMTWHGB
zXkCKI2<cC25T_std@{0u#aZ61)YZk3+9E{QF;QGewQ5#;bUCWe8_L}=S9U8S8uCdQ
zvNMXB!<PA~`&bt0I`^l=r?h94!1bkeJqgv6Eki3i{E>{ibm#Rx)opE9;3y=X0IHgk
z{hAhr5td<xVc*z$e>#~;%It|*6<+!{;xNVY=kLBTy#D})p!{jjj3Hyo8p0oFlH@MK
zjPqO3tbz~#!Nn-H^<>3gQSmpC9FqS45&2j8{{TvTxM?3G@j<m_DH%K<=jwSs&a`c9
zhbjmF46`1JJxxAq=!X)kTfFX7FaR5!*!-wHO;)(PRgBx)lD6RG*o-u%)cOxW+v`@Z
zuNC9_J-90XIVAOv`k&}KRz!=tipp~3*xfGVr&1vR6*O1JZtmxRD=I|@2{mHYHo#&q
z2TGMTXyUDNNJ`8A<gqvabtbeYxQ@aFjO}8C;B^v7(3Aek^!Fp{RICETlj<`PBDXsu
z!R4eQx40jcLG4+Eq`RG_kdAl?Iw&0g_7$ppvA<Si)!B77T}WBn{KT`l=L5N@u4MBZ
zU~Nz^tNqc~RE5M=qBQ%=es7@YKd<FblsiUqxcXEuA88X$I>f9L0lD3}B95f{3b}tj
zn(<r23c(o4sq4>k@7(=pD7J_!@s~e6$sXK<{5$)fYK3>R7jL}BHCrnScX8_&104r*
zO~GUGjy-C0h>@2)YGEHhI#k$WVThfodz_k>p?OFjjXp3Gk6%ijCFF5V#N1?d;Xv+b
z?GUR$Fe$7CD!ECyNzfv5>zbz~UBK)r+~l97S6I$@?^4P`qmPc9RC0`*Rjgqu76T-j
zVkt}jUwVe#4E)BdI++-HW~WHPrU0Z-PGrFLthq)VkEcr7n~()%yhM@@QfOd^f^Nk_
zG+9!8dRC8;RwCWU0+}>ncARwLlH3eHq&9sHYi7lPw@^C@&yq3bx(<pl^`}`*BZv&~
zlDztgWm`i0altWfGB9#*NZ{k}sGm=3W+uvKLDykyij8hIXKyD2fGMeam{G~~&$nt#
zs}VYPMC*Gn`7R?Hzq~l9qnRX?uyc?NRC&ORgUX(s)m+Fk^5kN(a%VeIi|$y|F19do
za4~`PteczXkr@fYDfv&YwON@Oc)$(F>6*{DP@GDbLbxZ;id@d;RT;#N+Dv?<af6dn
zt+q)+%I*wCdH{Jpk*xNWl58$ygpi?^oF9Km(zKP$z)TH}hwwk-3eVwoWmKOe<omD4
zexo2*B8w-J>B08;{{V$lkIR}8l4mEO@0yPGe9s^(c0bF{BO}t8r^76_PSPm_I486J
z0QKp`B^7bgwJKL-`xrcl+U%9xw5qn=fQ%o;oA#x+x;DV_Mh86)Z}aa?yE6TnQ3oul
zTmJyBj`*%h-f5wqd1fpdC!Vy{x|vgpto@_;9lSUMDH!cnQI}zcU-fKq5BCqR_|`Oc
z%QSFY=ZQY>{_BpvpI^qcB5>fTBC~D9gzwaBqH9Op8isUTu7{+t^~d-g+<H{8-53@*
zA(IiD`A);`=c=&u>__8Lq22^-jm!5K{{SQHQ29!Q;&6P4dV~BY*YN&TtSZ=0pK_k3
zX0jt(+eT4A+cZ7lZ|bM9tk~J3BX?ehHNS4MthfXVsD|I0kt%<5`q6WyLo{A|3AGPY
z?-}$y-iE46n^bkeX=ut2NT(U<D%=vuYcHD#>(q85(2Ati?yKvHK_jjJBryGIRIF&@
zb5?t3uHk8|hSCOJEcEvt)rtP5Z=0du`-;)DxoyFL*z3^L*S0Br8%YX<m~zpMhuwd|
zsq3oTOHCPxG&@|W<aGzuqTqsu<{0Q|(uVT3xcQ?v&$UvxQmwgjpK6DurL96)t@R9B
zUO$&O7>?<`MSoHL*HP{(BI4R>8@9K&LXJNp=t1s1>h08PaTee(%Si>uo)xpc4Dhe|
zxfEsLKHjV9zm+*PC9dVh@!Mf7&Hz1YR^5rgQb6xn%V#yL!5oHG8<EuITzZ3AF~q<J
zPNS`1%G0rKNt%}PI<O?>wG!_H2OL%`GRG?HakSOo14IC9k?TzvHFDL(ycZ*K^)-oi
zq{q0E!1k?JU87b<II9~LNb(NvtsY|7=Ha&cIsPIl%S*SAO<tPc4b$+dmlpX`fl~En
zB(;!5bGwHJ>MGnaIBcN>dxKa8;jN^@ZYx%j{{U_Z<chl;X=rbYsKNP$57MngaE(|o
zYH5%h40_d;R@fzt_bKO&)Fgz{HOK+CBMy61b3qN<O$?|odV~Ecj1z(eB*S_dnVhf8
za&u8CchwxVaLRhE8Je<O=$8Q`C9TQ%U(?vvQ)hn-^dTIKk@BBP%H18*Mk$wa5rdIg
zQbj(CD3UT{v?qi8+DX}*sY(*F(Uc~(W>rjcvaoLSZGgTjwuUhCl0`b|#yMjmyWpLT
zX}u3Tw-O}F7|$ma*F-#vTy!K;ZM1MNH<cJ(YSi!<4gkRDD4zB+t1f1d4V}@9zV3#Z
zHOi~*Q%;u2<4i9#NLor{<ndahYjm_VAiE|$rjev+4%6P28$yUF#0bbZ98i$kBU=S#
z$sE+*U>_&|3Q=~T;PFfLv5|(R;Kg*yw)Tn{c+M)uqX<|F4@z4(gGZbSTZj8Z1wRV7
zD<eod9%-vw%5E4htvg79>f&I;5V@{`eIX>cAdYig2AieD6e9{La$Jed<PlrX916+U
zQ`1(KBjwJgY2!7rn2cj@6<+GjDdRsTDX6&Wh*Cu<EaAIfF)`1jL1+!Mr5iZSGzs+i
zGP!8;(zY$u3(y<@8@*SvjP*llN!=30rIZjj^{1Vy)bm#&wAh#(DO%M}-XD!;$!06q
z&&VgeT(v;W$TemyJ~DV})3>n-1`S$^Tax6N3|5H4rA)R84;?D<LKIW57oLZ;4oFEd
zk+N993OPIz=~U#>;&aN@sW>#oHI$^8N(MjnMY<g98KrADR~&b(7Hk@?GH)0iNI3kd
zb96;G=wvVNrXc{y$t3Uq^)&_U$aMQW8S;V2Jqw>x{RKYb_i>n^IbY&$?;r52sik!U
z5JhFpt%3_t#n(G?ienB=1yar&xnf0HNkf2pQj*ZRnk$irH_8iitFgrMPaU)ix<@zz
z-kTWbC$(u<ui27oo|5@_AMEboN7VA)Ks{)u8?i=riJz%5hFdtFbQbwlBY@0)^L{7v
z0-!(<0Q~B2BQ5l#F?`snz1_>Tm4@dm39TzQOc9&L19&@O2m9UI@cwx8sBJA8dxTD&
zr1k@;tw}=2;GiNj8&Q1+`RDPgit1-B#&W~9H)G2=$3L0=l=O=ovTD;rb}l}(EKpz%
zS}X)97jyYj%5zqhFKSkgXz4|XxmGyIV^<=NVD~h{z#LTR&ot5#V0hxAZgM*gwIg@M
zCNP+xf+{k1rU<f0HFIF^%|{}f;8fUUcfx~I-!Js5(yth*{#njX0MN#Wl|qkt3+Jaw
zC19ORM9z3s7|$fqD#K(@P_f|VtFne|{+t?XyGbYsoq*tz-`<?qN7y<b`KetM%1J2!
zRYz{MTI7UWo^i<f(_QyIK{!1sDM6M7k%<eOl6b(S%nBtIT!!4=GjmWfHuR5?+~c73
zs}9k`iq1<i3=9rAsAjuWk~oTjoF2dDnwHHJM7RRk1dd5O<yWm%fQg}4W9|bEdHNsL
ztZFl!ahAr@i~>MEtz=kkwVPqaaN#}2T36^-EzzsLlNkg7(=^<L?^RlGU1S_&WQ_Ew
z6q4JRZBjU>XB9+FuJNIG_McU5L8P~kT!|u(scy<h`G>cyIxGjk9edSTVNo+HcjOV;
zr8@*|EzJA9Ijv+4OEMGkvZ!6Y#PzGT(g@{3!v*;O_Bh2$9npt$7emc~_|JS*StCJr
zZjeYGC2~O=@kG7r7g92mQkPDp1QM&3R7Tw2G8~m``qt_!cJOiXs2%ymQ?ZGbM`s`e
z9KULUd!uu3fk{}<{F&~3O;gu%N|gPpQClK8<(*{nK|kv<xZtJ*K_!z+rs6Ode;R>|
zA75(LSlp-%fzeOQzr?>o=~nGvNafud@Ou%~pEpCHMMY?1Y4AUnECdp+yOiUd%l&bh
z)3h!|(s>o4Z6jA(h(H7!vW}F=tkoZSa6PJJ8=_>Ci6WDJ6~6al>;C}Pt7Sy7^Y?{8
z6h2_d7(Ma)>dFQO=KJ5yx!P%*bEf056QN~SQ}f6Czxvf;8H%4d06nBb(EkASjSPSs
zj`YB$H^>|wzV%V5VqsUK8+l>~`$euyXR7y8=xZ_J>_!i*ZbIfKByy}UN2sP<S`H+P
z21C==>GiF-l$FlPbr!}M1qj>*M|!mj94jjj!;nX6ds~Cor@c4IXt`6Bq?ys93GKz%
zXN(1$j5-dEJxBOf45Y4Ah8P@Tw`I#HWjkDQ2<i=0x{f_Sj0H>0xyNZfp7o>mmzgWw
zentbGx!cgw5+ECJs|<x%mBu#l%_1(==kTmq&|S17qeS4nxj8C6@*nRnq5Tb8NZQec
z^S&1Qk^#dH=zXfRD&1Fx6>=N0xsFuxA%DKU{-fTSchQ-@Ab65h-3bK#wImX{0J3xa
zDjSO<3oF9^04s8iIv@VOwIo*=ZL)Gf`@)k=J%+5&qK>3s{Mf1O{!&7o4{E5YjKS29
zd;8V6Q;<Qd+a^sUOv)JNy+&<|j+v<obI8G|;Q6sp7kL~h-bPf~YBqeu{VOhAHSL>x
zV|0F%+e>V+AS82Cp2?FW46mWCo>QtbIL27rikz!et4Ur1jmK%=irKu<Bb09g2imim
z*b-tVYmS+zX`@vn(8<xk-x<j3iXgp}8|KL#^&~<_PT3bD)}JH^ox`E2a$~YAtby>#
z$DX35hIL%SCLIq2TB%1H54qG-&9*~^;r%FUnHf%;WUV6Yyd_(uSZKlRPK+O#pq^r!
z;=L$Z);#p|xZ@Ila5~cksK9Ns*zPx0qDEwkfyOFh9%(jZ8D3X1<hKr|HBoJUdC20n
zp4GMz1yG#%V<gs!HfGJ-U<j*%D<<u>u?t!|gr|d1NoH7s*A-HB98AQuc{w#Q*_G+m
zw7jsZjC86q+?b!I6)tYVvblEI+6YK)Y8bDCb27Q%jYOBK$&y7<xI*pbq4|0d=~ATb
zS}k1PM;!Uct(_+ICWrt=YlgkIo#pvKARe9S{*Ns7?g>^HJm$5h?-|ZVc>D~~q1>F-
zY<J>70X=H{w1M~oHBB@^Xs;!*$oo%2PhCD+XA(eir>LzjDM-!^eJNsWjiDS-9R{7A
zg_~2pKbwP6n2A4gBkNMyS~)v_I6aMOrK%1{=CpE8P;TaY8hSU)$Tgsmgct^%Kwbqj
zw>*mHoXSxH$0z>)*HJM!=A6x*wHux@ST<;o(%9=!vCn!!bKaQEg(uRY<=A0yAS51@
zUiHi{Iv>8sNzY;F{xp^kv`H!K#3((_{{UK{zJ2?H9F|f(yiv_%a$CAFyy7V)MV2~;
zc3aw~+45&Kj2jd+W*x*NHbLN1tcw?I!+}rKW{Eyo0blloL2L4iAue5j7_F1KG^rnT
znz69KW*Jq%BiGurUP88!D;&u=GdcYi^dGHdGXRAYj4x6<nzJ;!?g&%zqAHKT0YoKh
zqmqoTD-*;6u>-X{!}F8*Q#A9OlULzSI(1Rhwgpkt^d^okv)X4EdARy0{{R#GszgkI
zPo0CvcKy-vuh4X*jC_Rmtvrmp#Av}Ij+J6G80k^=%}*apQ(Kb9B6r6%9Fga(SqF-2
zfB~AT5>fJqH6Y+%Q;$6;E4GtBG>{HE(iWhpCyu6^Av_aOP~7Jjrn3Yd=B8x|ed;Zv
zIX!3@6E1l@Jt_%CIVYO6BZXS3=OnQoN{L9)F)2Ji9G*!%O;?jF%onFX2YR+on<hH@
z)pt=8g+Un|NUW}Ckr+W5<N=aDDu9pO=dMX4`qZ})0fb+=PtA`?h{!hWKg2RUu}6^X
zRGGwZ_AGsSdQuk>Mdi%cW&mv+MMrRakn%Twx=%s=H3X~!Jj5IxgQ4kCn=Z(!HPZd1
z)F{U8JMmdMg~OOh1mw8{dk$$WWRiQCi;O8?bM2fFO4jD&ODlcn$-wP`PxPR*WJNBj
z(g<S!Gco`P>JO%B>JRkTvJ05g1ZUGce?P*Zg3TehfT#@TEx)p2q*-H@NFcW5Oqr08
zjo;om^aB_k{<PPaJ9HR>Y=k2T9lmY&;~!7!Rug^zV1FL<OUVN9Pt3R^DIVMk(uizO
zcV{El^{H=DmaHs2axskLQqLcnpo7S#s^ojqwl;7&R&AV-c9BHk)W;w`$KI;>b6H$L
zDkveJYbo!?Ut?QP**wk)BL?^Ae!i6ss|+^LG>sva;a76`W8^?R`}M1YTh#ATQjaN_
z46|PsK@-NYovgm6*NVQD0H7^~$6;A|JaRM=GReo8jk)0Q*0n-OY(VHwJanp+u3U6l
z4vmaXayjU7dWx{E9}F@v)Ou4E7L;xs`x<h<$&s~{f#ci%0IgZ<TC*fco6J(AFwari
zr*|L+P%=0wdy0lM+S|TR-S&>vU@E#cK*(TeUEan{Nz}JtvF;dX$@+R#<#6aSpdV_k
ziGnuZA4--|(R|p+^rF`lW}Tw9Wa}7lfPS9FqhLn_Q5(lEsyC-<nTF)w+;G^e<pY|f
z8xR4-O%#E04=Q?!go20EQr0SNEXhg5*^ZXqoQB9f53NDvZ9dgzH{0@`x@pY0>-Z7g
zx+65sx^=o4w-)>oo%>c5^lW0>G94R&E2?xXK3d1WfCYb-2N}g~wmKs#nMo>YspZZ@
zcVJ*hcn|D-{p&JwJ*=%1szFnX8j|4|PU!meAW$!o@ddr(?P0`o5z6~FsHt|A`<pAT
zLgkIv0B+|se8dcUl;j?31jj9`-du`7?0XUFD^d%>suYZhh1rCcRA}3=SjsT-A#c8!
z$KpK=A%pBrtbj=i@~w`(-pAUi8B}1ddGAlOx-qjO<>6!VAwQ_}H2JNeE;lt`xwm`<
z8%<pTJ-;ce&$F&om9mY>Kd<;7YP5(^D;ZsGa#f=ktHrbuvu-@oV!4qR5CIvcIp(Lf
zi+9qZ7jHv~Dx-IcEvB?o;i5S9s}bs!(u@`=eZ^Li%rbnq$f|N$73eExIm2?@T9Mq@
z#7@~qQ}i`T_e~cADn4=0ip~*Ar{&1{RG(?NhxpIqN3~CMX&hH{Rhk*%;ei<ML^6;z
zN}!I|t7A}<j5@3RMMe@iB-~J`_NjA@>76mf(|eM_vB}D|-)gY-ir^3iM{!L^&>XI6
z!i)-!Xk%9uAF|6aTd)PNNc5_AknUlQ^if{SqLeOkrul4C7VORydz@EKYXYPoA4({w
zq40_c6PQCG=~#D`8{GDyirPx$r>RaG285DG+f%9u4il{uQbLn=8uo6@hmC;s6n|g~
zfC8{=^v@JgSIC+b+GEqf<nUuS{{UsGczhWoa)LQS)Hm{?ipi@x7k14v;M)nJRhBM5
zz&Yf8G}!zn4Y|US%kF5RlYFm1B$;7+HEaAxGCC1f;L)wovdRuR9+Xi_o>wcC7uLqy
ze+a1syi3tV6jO3xWIb@50HiWviYgqLN+@Y;ifK6*2Z|`EWmt`cLm}=@6j4yiMQI-&
zcA+vBQlsx<y%bh`N_#Sml)I6@KPY30t$%P)#|Mr@6jV0(OriWUI{-f&4<qSLGZaz@
z!J>*$9Ta8XymBe?7c~Z+!rm60TO^YlOYi(g@Em<lwG>v=x|`MbOY!7@xW^S@5tH?z
zik0ymWi2X~p!qZOtqT|Vq?7xL!S_6WQ$-cB-0P7ja4<45dJ3?I<i!+KAu+(@jC2(`
z9GWPmiDUP6rb!_u0*WfG#bFRrh&lG6icRP=kl+@MLH0CJQ+7Kcj2ND^8A5oXicOVb
zyoh`H)Jng+!Stev&F)$iZLyLE)YVy`4hX=_6jJCkQn|M(0o%qg!0S~OHd%&G1CUQ@
zD5+(klgEHS>6~Emilp|<f=d$Q^v^-<MHH^?RkSM`ADp>gmPI)Su;U-rszs-}x)mIz
zcIW%ub43(fYhr7CA>o4I)xLRxf)5~M<+%R<J!<8?lriUJLlKS#zcf)oVr-H0q*MjA
z<%Ti`e&(7iqf*SkWSryDiYSqVO^DqT9zRN!9itntXri-woD%qpg~q}F;QCcdn>gf!
z%!P|O?c3A}D6Jswc2hZx4>tEwMI&P*sXq15C`9=VHucCkZ~nCuR*!Q;D%(QNUKt5-
z*qXYNsX())03Cg(qLq<Isl)#DN66<Z^rA7AWyo>2rg;=mSlixa6t!|>k?m8tl#F*K
znA(lIzS$MWZNrg86uGRfZ9#L*x6ofR=E{*G_Bg>6rD*Eo89a`2MHO8Uw4$BH&mPvn
z%ChmE#CE1HBOa7dSk#KS$v7(x#wqCBVDzGj=T6T<o#^x#AS!YX<5ezp%7s`A;{ee`
zbyJhM(N0dsDRUD>=dsU9t1L+<Ag&Ls6jey{;O`rY9FnXaLH*({DW0qJG$iNlvXDg-
zQoCbm7ChEL>J3<qMJI6`Xrh#ajVnofx7tZ~U+*ucxc9ALl^aeONcmWI6j4zp?wq)t
zitaGgdc^QJ1R5x%OLLNeq|qy;Q@2{JBu|1BvONtHS6#F^8c%Yf+(`_ZWsP?FikoyS
zH~C^fdr?JZCqka~A~C>8A~L?&rwCN?2kS)@%Tb(_p%t+z#s_MC0HTVBE3=KC|Jir}
Bp^E?j

diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
deleted file mode 100644
index 63905c04c..000000000
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ /dev/null
@@ -1,267 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import py_paddle.swig_paddle as api
-import numpy as np
-
-from paddle.v2 import data_type
-from paddle.v2.data_feeder import DataFeeder
-
-
-class DataFeederTest(unittest.TestCase):
-    def dense_reader(self, size):
-        data = np.random.random(size)
-        return data
-
-    def sparse_binary_reader(self, high, size_limit, non_empty=False):
-        num = np.random.randint(size_limit)  # num could be 0
-        while non_empty and num == 0:
-            num = np.random.randint(size_limit)
-        return np.random.randint(high, size=num).tolist()
-
-    def test_dense(self):
-        def compare(input):
-            feeder = DataFeeder([('image', data_type.dense_vector(784))],
-                                {'image': 0})
-            arg = feeder(input)
-            output = arg.getSlotValue(0).copyToNumpyMat()
-            input = np.array(input, dtype='float32')
-            self.assertAlmostEqual(input.all(), output.all())
-
-        # test numpy array
-        batch_size = 32
-        dim = 784
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.dense_reader(dim))
-            data.append(each_sample)
-        compare(data)
-
-        # each feature is a list
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.dense_reader(dim).tolist())
-            data.append(each_sample)
-        compare(data)
-
-        # test tuple
-        data = []
-        for i in xrange(batch_size):
-            each_sample = (self.dense_reader(dim).tolist(), )
-            data.append(each_sample)
-        compare(data)
-
-    def test_sparse_binary(self):
-        dim = 10000
-        batch_size = 32
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(self.sparse_binary_reader(dim, 50))
-            data.append(each_sample)
-        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
-                            {'input': 0})
-        arg = feeder(data)
-        output = arg.getSlotValue(0)
-        assert isinstance(output, api.Matrix)
-        for i in xrange(batch_size):
-            self.assertEqual(output.getSparseRowCols(i), data[i][0])
-
-    def test_sparse(self):
-        dim = 10000
-        batch_size = 32
-        v = []
-        w = []
-        data = []
-        for dat in xrange(batch_size):
-            each_sample = []
-            a = self.sparse_binary_reader(dim, 40, non_empty=True)
-            b = self.dense_reader(len(a)).tolist()
-            v.append(a)
-            w.append(np.array(b, dtype="float32"))
-            each_sample.append(zip(a, b))
-            data.append(each_sample)
-
-        feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
-                            {'input': 0})
-        arg = feeder(data)
-        output = arg.getSlotValue(0)
-        assert isinstance(output, api.Matrix)
-        for i in xrange(batch_size):
-            self.assertEqual(output.getSparseRowCols(i), v[i])
-            cols_value = output.getSparseRowColsVal(i)
-            value = [val[1] for val in cols_value]
-            value = np.array(value, dtype="float32")
-            self.assertAlmostEqual(value.all(), w[i].all())
-
-    def test_integer(self):
-        value_range = 100
-        batch_size = 32
-        index = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(np.random.randint(value_range))
-            index.append(each_sample)
-        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
-                            {'input': 0})
-        arg = feeder(index)
-        output = arg.getSlotIds(0).copyToNumpyArray()
-        index = np.array(index, dtype='int')
-        self.assertEqual(output.all(), index.flatten().all())
-
-    def test_integer_sequence(self):
-        value_range = 10000
-        batch_size = 32
-        start = [0]
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(
-                self.sparse_binary_reader(
-                    value_range, 30, non_empty=True))
-            data.append(each_sample)
-            start.append(len(each_sample[0]) + start[-1])
-        feeder = DataFeeder(
-            [('input', data_type.integer_value_sequence(value_range))],
-            {'input': 0})
-        arg = feeder(data)
-        output_data = arg.getSlotIds(0).copyToNumpyArray()
-        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
-
-        index = []
-        for dat in data:
-            index.extend(x for x in dat[0])  # only one feature, so dat[0]
-        index = np.array(index, dtype='int')
-        start = np.array(start, dtype='int')
-        self.assertEqual(output_data.all(), index.all())
-        self.assertEqual(output_start.all(), start.all())
-
-    def test_multiple_features(self):
-        batch_size = 2
-        data = []
-        for i in xrange(batch_size):
-            each_sample = []
-            each_sample.append(np.random.randint(10))
-            each_sample.append(
-                self.sparse_binary_reader(
-                    20000, 40, non_empty=True))
-            each_sample.append(self.dense_reader(100))
-            data.append(each_sample)
-
-        # test multiple features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
-        arg = feeder(data)
-        output_dense = arg.getSlotValue(0).copyToNumpyMat()
-        output_sparse = arg.getSlotValue(1)
-        output_index = arg.getSlotIds(2).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(output_dense[i].all(), data[i][2].all())
-            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
-            self.assertEqual(output_index[i], data[i][0])
-
-        # reader returns 3 features, but only use 2 features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
-        arg = feeder(data)
-        output_dense = arg.getSlotValue(0).copyToNumpyMat()
-        output_index = arg.getSlotIds(1).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(output_dense[i].all(), data[i][2].all())
-            self.assertEqual(output_index[i], data[i][0])
-
-        # reader returns 3 featreus, one is duplicate data
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10)),
-                      ('fea3', data_type.dense_vector(100))]
-        feeder = DataFeeder(data_types,
-                            {'fea0': 2,
-                             'fea1': 1,
-                             'fea2': 0,
-                             'fea3': 2})
-        arg = feeder(data)
-        fea0 = arg.getSlotValue(0).copyToNumpyMat()
-        fea1 = arg.getSlotValue(1)
-        fea2 = arg.getSlotIds(2).copyToNumpyArray()
-        fea3 = arg.getSlotValue(3).copyToNumpyMat()
-        for i in xrange(batch_size):
-            self.assertEqual(fea0[i].all(), data[i][2].all())
-            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
-            self.assertEqual(fea2[i], data[i][0])
-            self.assertEqual(fea3[i].all(), data[i][2].all())
-
-    def test_multiple_features_tuple(self):
-        batch_size = 2
-        data = []
-        for i in xrange(batch_size):
-            a = np.random.randint(10)
-            b = self.sparse_binary_reader(20000, 40, non_empty=True)
-            c = self.dense_reader(100)
-            each_sample = (a, b, c)
-            data.append(each_sample)
-
-        # test multiple features
-        data_types = [('fea0', data_type.dense_vector(100)),
-                      ('fea1', data_type.sparse_binary_vector(20000)),
-                      ('fea2', data_type.integer_value(10))]
-        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
-        arg = feeder(data)
-        out_dense = arg.getSlotValue(0).copyToNumpyMat()
-        out_sparse = arg.getSlotValue(1)
-        out_index = arg.getSlotIds(2).copyToNumpyArray()
-        for i in xrange(batch_size):
-            self.assertEqual(out_dense[i].all(), data[i][2].all())
-            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
-            self.assertEqual(out_index[i], data[i][0])
-
-    def test_dense_set_shape(self):
-        # test 2-D data
-        def gen_data(batch_size, shape):
-            data = []
-            for i in xrange(batch_size):
-                each_sample = []
-                each_sample.append(np.random.random(shape))
-                data.append(each_sample)
-            return data
-
-        feeder = DataFeeder([('image', data_type.dense_array(2352))],
-                            {'image': 0})
-        arg = feeder(gen_data(32, (3, 28, 28)))
-        h = arg.getSlotFrameHeight(0)
-        w = arg.getSlotFrameWidth(0)
-        self.assertEqual(h, 28)
-        self.assertEqual(w, 28)
-
-        arg = feeder(gen_data(32, (3, 30, 32)))
-        h = arg.getSlotFrameHeight(0)
-        w = arg.getSlotFrameWidth(0)
-        self.assertEqual(h, 30)
-        self.assertEqual(w, 32)
-
-
-if __name__ == '__main__':
-    api.initPaddle("--use_gpu=0")
-    suite = unittest.TestLoader().loadTestsFromTestCase(DataFeederTest)
-    unittest.TextTestRunner().run(suite)
-    if api.isGpuVersion():
-        api.setUseGpu(True)
-        unittest.main()
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
deleted file mode 100644
index c78bbdc40..000000000
--- a/python/paddle/v2/tests/test_image.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-
-import paddle.v2.image as image
-
-
-class Image(unittest.TestCase):
-    def test_resize_flip_chw(self):
-        # resize
-        im = image.load_image('cat.jpg')
-        im = image.resize_short(im, 256)
-        self.assertEqual(256, min(im.shape[:2]))
-        self.assertEqual(3, im.shape[2])
-
-        # flip
-        im = image.left_right_flip(im)
-        im2 = np.flip(im, 1)
-        self.assertEqual(im.all(), im2.all())
-
-        # to_chw
-        h, w, c = im.shape
-        im = image.to_chw(im)
-        self.assertEqual(c, im.shape[0])
-        self.assertEqual(h, im.shape[1])
-        self.assertEqual(w, im.shape[2])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
deleted file mode 100644
index b169a0f38..000000000
--- a/python/paddle/v2/tests/test_layer.py
+++ /dev/null
@@ -1,290 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.v2.activation as activation
-import paddle.v2.attr as attr
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-import paddle.v2.pooling as pooling
-import paddle.v2.networks as networks
-import paddle.v2.evaluator as evaluator
-
-pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
-label = layer.data(name='label', type=data_type.integer_value(10))
-weight = layer.data(name='weight', type=data_type.dense_vector(1))
-combine_weight = layer.data(
-    name='weight_combine', type=data_type.dense_vector(10))
-score = layer.data(name='score', type=data_type.dense_vector(1))
-
-hidden = layer.fc(input=pixel,
-                  size=100,
-                  act=activation.Sigmoid(),
-                  param_attr=attr.Param(name='hidden'))
-inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
-conv = layer.img_conv(
-    input=pixel,
-    filter_size=1,
-    filter_size_y=1,
-    num_channels=8,
-    num_filters=16,
-    act=activation.Linear())
-
-
-class ImageLayerTest(unittest.TestCase):
-    def test_conv_layer(self):
-        conv_shift = layer.conv_shift(a=pixel, b=score)
-        print layer.parse_network(conv, conv_shift)
-
-    def test_pooling_layer(self):
-        maxpool = layer.img_pool(
-            input=conv,
-            pool_size=2,
-            num_channels=16,
-            padding=1,
-            pool_type=pooling.Max())
-        spp = layer.spp(input=conv,
-                        pyramid_height=2,
-                        num_channels=16,
-                        pool_type=pooling.Max())
-        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
-        print layer.parse_network([maxpool, spp, maxout])
-
-    def test_norm_layer(self):
-        norm1 = layer.img_cmrnorm(input=conv, size=5)
-        norm2 = layer.batch_norm(input=conv)
-        norm3 = layer.sum_to_one_norm(input=conv)
-        print layer.parse_network([norm1, norm2, norm3])
-
-
-class AggregateLayerTest(unittest.TestCase):
-    def test_aggregate_layer(self):
-        pool = layer.pooling(
-            input=pixel,
-            pooling_type=pooling.Avg(),
-            agg_level=layer.AggregateLevel.TO_SEQUENCE)
-        last_seq = layer.last_seq(input=pixel)
-        first_seq = layer.first_seq(input=pixel)
-        concat = layer.concat(input=[last_seq, first_seq])
-        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
-        print layer.parse_network(
-            [pool, last_seq, first_seq, concat, seq_concat])
-
-
-class MathLayerTest(unittest.TestCase):
-    def test_math_layer(self):
-        addto = layer.addto(input=[pixel, pixel])
-        linear_comb = layer.linear_comb(
-            weights=combine_weight, vectors=hidden, size=10)
-        interpolation = layer.interpolation(
-            input=[hidden, hidden], weight=score)
-        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
-        power = layer.power(input=pixel, weight=score)
-        scaling = layer.scaling(input=pixel, weight=score)
-        slope = layer.slope_intercept(input=pixel)
-        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
-        cos_sim = layer.cos_sim(a=pixel, b=pixel)
-        trans = layer.trans(input=tensor)
-        print layer.parse_network([
-            addto, linear_comb, interpolation, power, scaling, slope, tensor,
-            cos_sim, trans
-        ])
-
-
-class ReshapeLayerTest(unittest.TestCase):
-    def test_reshape_layer(self):
-        block_expand = layer.block_expand(
-            input=conv, num_channels=4, stride_x=1, block_x=1)
-        expand = layer.expand(
-            input=weight,
-            expand_as=pixel,
-            expand_level=layer.ExpandLevel.FROM_NO_SEQUENCE)
-        repeat = layer.repeat(input=pixel, num_repeats=4)
-        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
-        rotate = layer.rotate(input=pixel, height=16, width=49)
-        print layer.parse_network(
-            [block_expand, expand, repeat, reshape, rotate])
-
-
-class RecurrentLayerTest(unittest.TestCase):
-    def test_recurrent_layer(self):
-        word = layer.data(name='word', type=data_type.integer_value(12))
-        recurrent = layer.recurrent(input=word)
-        lstm = layer.lstmemory(input=word)
-        gru = layer.grumemory(input=word)
-        print layer.parse_network([recurrent, lstm, gru])
-
-
-class CostLayerTest(unittest.TestCase):
-    def test_cost_layer(self):
-        cost1 = layer.classification_cost(input=inference, label=label)
-        cost2 = layer.classification_cost(
-            input=inference, label=label, weight=weight)
-        cost3 = layer.cross_entropy_cost(input=inference, label=label)
-        cost4 = layer.cross_entropy_with_selfnorm_cost(
-            input=inference, label=label)
-        cost5 = layer.square_error_cost(input=inference, label=label)
-        cost6 = layer.square_error_cost(
-            input=inference, label=label, weight=weight)
-        cost7 = layer.multi_binary_label_cross_entropy_cost(
-            input=inference, label=label)
-        cost8 = layer.rank_cost(left=score, right=score, label=score)
-        cost9 = layer.lambda_cost(input=inference, score=score)
-        cost10 = layer.sum_cost(input=inference)
-        cost11 = layer.huber_regression_cost(input=score, label=label)
-        cost12 = layer.huber_classification_cost(input=score, label=label)
-
-        print layer.parse_network([cost1, cost2])
-        print layer.parse_network([cost3, cost4])
-        print layer.parse_network([cost5, cost6])
-        print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
-
-        crf = layer.crf(input=inference, label=label)
-        crf_decoding = layer.crf_decoding(input=inference, size=3)
-        ctc = layer.ctc(input=inference, label=label)
-        warp_ctc = layer.warp_ctc(input=pixel, label=label)
-        nce = layer.nce(input=inference, label=label, num_classes=3)
-        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
-
-        print layer.parse_network(
-            [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
-
-
-class OtherLayerTest(unittest.TestCase):
-    def test_sampling_layer(self):
-        maxid = layer.max_id(input=inference)
-        sampling_id = layer.sampling_id(input=inference)
-        eos = layer.eos(input=maxid, eos_id=5)
-        layer.printer(maxid)
-        print layer.parse_network([maxid, sampling_id, eos])
-
-    def test_slicing_joining_layer(self):
-        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
-        print layer.parse_network(pad)
-
-
-class ProjOpTest(unittest.TestCase):
-    def test_projection(self):
-        input = layer.data(name='data2', type=data_type.dense_vector(784))
-        word = layer.data(
-            name='word2', type=data_type.integer_value_sequence(10000))
-        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
-        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
-        mixed0 = layer.mixed(
-            size=256,
-            input=[
-                layer.full_matrix_projection(input=fc0),
-                layer.full_matrix_projection(input=fc1)
-            ])
-        with layer.mixed(size=200) as mixed1:
-            mixed1 += layer.full_matrix_projection(input=fc0)
-            mixed1 += layer.identity_projection(input=fc1)
-
-        table = layer.table_projection(input=word)
-        emb0 = layer.mixed(size=512, input=table)
-        with layer.mixed(size=512) as emb1:
-            emb1 += table
-
-        scale = layer.scaling_projection(input=fc0)
-        scale0 = layer.mixed(size=100, input=scale)
-        with layer.mixed(size=100) as scale1:
-            scale1 += scale
-
-        dotmul = layer.dotmul_projection(input=fc0)
-        dotmul0 = layer.mixed(size=100, input=dotmul)
-        with layer.mixed(size=100) as dotmul1:
-            dotmul1 += dotmul
-
-        context = layer.context_projection(input=fc0, context_len=5)
-        context0 = layer.mixed(size=500, input=context)
-        with layer.mixed(size=500) as context1:
-            context1 += context
-
-        conv = layer.conv_projection(
-            input=input,
-            filter_size=1,
-            num_channels=1,
-            num_filters=128,
-            stride=1,
-            padding=0)
-        conv0 = layer.mixed(input=conv, bias_attr=True)
-        with layer.mixed(bias_attr=True) as conv1:
-            conv1 += conv
-
-        print layer.parse_network(mixed0)
-        print layer.parse_network(mixed1)
-        print layer.parse_network(emb0)
-        print layer.parse_network(emb1)
-        print layer.parse_network(scale0)
-        print layer.parse_network(scale1)
-        print layer.parse_network(dotmul0)
-        print layer.parse_network(dotmul1)
-        print layer.parse_network(conv0)
-        print layer.parse_network(conv1)
-
-    def test_operator(self):
-        ipt0 = layer.data(name='data1', type=data_type.dense_vector(784))
-        ipt1 = layer.data(name='word1', type=data_type.dense_vector(128))
-        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
-        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
-
-        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
-        dotmul0 = layer.mixed(input=dotmul_op)
-        with layer.mixed() as dotmul1:
-            dotmul1 += dotmul_op
-
-        conv = layer.conv_operator(
-            img=ipt0,
-            filter=ipt1,
-            filter_size=1,
-            num_channels=1,
-            num_filters=128,
-            stride=1,
-            padding=0)
-        conv0 = layer.mixed(input=conv)
-        with layer.mixed() as conv1:
-            conv1 += conv
-
-        print layer.parse_network(dotmul0)
-        print layer.parse_network(dotmul1)
-        print layer.parse_network(conv0)
-        print layer.parse_network(conv1)
-
-
-class NetworkTests(unittest.TestCase):
-    def test_vgg(self):
-        img = layer.data(name='pixel1', type=data_type.dense_vector(784))
-        vgg_out = networks.small_vgg(
-            input_image=img, num_channels=1, num_classes=2)
-        print layer.parse_network(vgg_out)
-
-
-class EvaluatorTest(unittest.TestCase):
-    def test_evaluator(self):
-        img = layer.data(name='pixel2', type=data_type.dense_vector(784))
-        output = layer.fc(input=img,
-                          size=10,
-                          act=activation.Softmax(),
-                          name='fc_here')
-        lbl = layer.data(name='label2', type=data_type.integer_value(10))
-        cost = layer.cross_entropy_cost(input=output, label=lbl)
-
-        evaluator.classification_error(input=output, label=lbl)
-        print layer.parse_network(cost)
-        print layer.parse_network(output)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_op.py b/python/paddle/v2/tests/test_op.py
deleted file mode 100644
index 15d5aef51..000000000
--- a/python/paddle/v2/tests/test_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-import paddle.v2.op as op
-
-
-class OpTest(unittest.TestCase):
-    def test_op(self):
-        x = layer.data(name='data', type=data_type.dense_vector(128))
-        x = op.exp(x)
-        x = op.sqrt(x)
-        x = op.reciprocal(x)
-        x = op.log(x)
-        x = op.abs(x)
-        x = op.sigmoid(x)
-        x = op.tanh(x)
-        x = op.square(x)
-        x = op.relu(x)
-        y = 1 + x
-        y = y + 1
-        y = x + y
-        y = y - x
-        y = y - 2
-        y = 2 - y
-        y = 2 * y
-        y = y * 3
-        z = layer.data(name='data_2', type=data_type.dense_vector(1))
-        y = y * z
-        y = z * y
-        y = y + z
-        y = z + y
-        print layer.parse_network(y)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
deleted file mode 100644
index 264442be1..000000000
--- a/python/paddle/v2/tests/test_paramconf_order.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright PaddlePaddle contributors. All Rights Reservedd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-import math
-import paddle.v2 as paddle
-
-
-def wordemb(inlayer):
-    wordemb = paddle.layer.table_projection(
-        input=inlayer,
-        size=5,
-        param_attr=paddle.attr.Param(
-            name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
-    return wordemb
-
-
-def train():
-    word_dict = paddle.dataset.imikolov.build_dict()
-    dict_size = len(word_dict)
-    # Every layer takes integer value of range [0, dict_size)
-    firstword = paddle.layer.data(
-        name="firstw", type=paddle.data_type.integer_value(dict_size))
-    secondword = paddle.layer.data(
-        name="secondw", type=paddle.data_type.integer_value(dict_size))
-    thirdword = paddle.layer.data(
-        name="thirdw", type=paddle.data_type.integer_value(dict_size))
-    fourthword = paddle.layer.data(
-        name="fourthw", type=paddle.data_type.integer_value(dict_size))
-    nextword = paddle.layer.data(
-        name="fifthw", type=paddle.data_type.integer_value(dict_size))
-
-    Efirst = wordemb(firstword)
-    Esecond = wordemb(secondword)
-    Ethird = wordemb(thirdword)
-    Efourth = wordemb(fourthword)
-
-    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
-    hidden1 = paddle.layer.fc(name="fc1",
-                              input=contextemb,
-                              size=128,
-                              act=paddle.activation.Sigmoid(),
-                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
-                              bias_attr=paddle.attr.Param(learning_rate=2),
-                              param_attr=paddle.attr.Param(
-                                  initial_std=1. / math.sqrt(5 * 8),
-                                  learning_rate=1,
-                                  l2_rate=6e-4))
-    predictword = paddle.layer.fc(input=hidden1,
-                                  size=dict_size,
-                                  bias_attr=paddle.attr.Param(learning_rate=2),
-                                  act=paddle.activation.Softmax())
-
-    return paddle.layer.classification_cost(input=predictword, label=nextword)
-
-
-class TestParamConfOrder(unittest.TestCase):
-    def test_param_conf_order(self):
-        paddle.init()
-        cost = train()
-        parameters = paddle.parameters.create(cost)
-        adagrad = paddle.optimizer.AdaGrad(
-            learning_rate=3e-3,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-
-        trainer = paddle.trainer.SGD(cost, parameters, adagrad)
-        for p in trainer.get_topology_proto().parameters:
-            if p.name == "_fc1.w0":
-                self.assertEqual(p.decay_rate, 6e-4)
-            else:
-                self.assertEqual(p.decay_rate, 8e-4)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
deleted file mode 100644
index 3bfd9348a..000000000
--- a/python/paddle/v2/tests/test_parameters.py
+++ /dev/null
@@ -1,143 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import sys
-
-try:
-    import py_paddle
-
-    del py_paddle
-except ImportError:
-    print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
-                         "unittest will not be run."
-    sys.exit(0)
-
-import paddle.v2.parameters as parameters
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-from paddle.v2.attr import ParamAttr
-from paddle.proto.ParameterConfig_pb2 import ParameterConfig
-import random
-import cStringIO
-import numpy
-
-
-def __rand_param_config__(name, psize=None):
-    conf = ParameterConfig()
-    conf.name = name
-    size = 1
-    if psize is None:
-        for i in xrange(2):
-            dim = random.randint(1, 1000)
-            conf.dims.append(dim)
-            size *= dim
-    else:
-        size = psize
-    conf.size = size
-    assert conf.IsInitialized()
-    return conf
-
-
-class TestParameters(unittest.TestCase):
-    def test_serialization(self):
-        params = parameters.Parameters()
-        params.__append_config__(__rand_param_config__("param_0"))
-        params.__append_config__(__rand_param_config__("param_1"))
-
-        for name in params.names():
-            param = params.get(name)
-            param[:] = numpy.random.uniform(
-                -1.0, 1.0, size=params.get_shape(name))
-            params.set(name, param)
-
-        tmp_file = cStringIO.StringIO()
-        params.to_tar(tmp_file)
-        tmp_file.seek(0)
-        params_dup = parameters.Parameters.from_tar(tmp_file)
-
-        self.assertEqual(params_dup.names(), params.names())
-
-        for name in params.names():
-            self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
-            p0 = params.get(name)
-            p1 = params_dup.get(name)
-            self.assertTrue(numpy.isclose(p0, p1).all())
-
-    def test_initializer(self):
-        def initializer(name):
-            assert name == "fc.w"
-            mat = numpy.ones((3, 2), dtype=numpy.float32)
-            mat[1, 1] = 2
-            return mat
-
-        x = layer.data(name="x", type=data_type.dense_vector(3))
-        y = layer.fc(x,
-                     size=2,
-                     bias_attr=False,
-                     param_attr=ParamAttr(
-                         name="fc.w", initializer=initializer))
-        params = parameters.create(y)
-        val = params["fc.w"]
-        assert val.shape == (3, 2)
-        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
-        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
-
-    def test_init_from_tar(self):
-        def get_param(names, size):
-            p = parameters.Parameters()
-            for k, v in zip(names, size):
-                p.__append_config__(__rand_param_config__(k, v))
-            for name in p.names():
-                param = p.get(name)
-                param[:] = numpy.random.uniform(
-                    -1.0, 1.0, size=p.get_shape(name))
-                p.set(name, param)
-            return p
-
-        def get_parames():
-            name1 = ['param_0', 'param_1']
-            size1 = [128, 256]
-            p1 = get_param(name1, size1)
-            file1 = cStringIO.StringIO()
-            p1.to_tar(file1)
-            file1.seek(0)
-
-            name2 = ['param_0', 'param_1', 'param_2']
-            size2 = [128, 256, 288]
-            p2 = get_param(name2, size2)
-            file2 = cStringIO.StringIO()
-            p2.to_tar(file2)
-            file2.seek(0)
-            return p1, file1, p2, file2
-
-        p1, file1, p2, file2 = get_parames()
-        p2.init_from_tar(file1)
-        for name in p1.names():
-            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
-            v1 = p1.get(name)
-            v2 = p2.get(name)
-            self.assertTrue(numpy.isclose(v1, v2).all())
-
-        p1, file1, p2, file2 = get_parames()
-        p1.init_from_tar(file2)
-        for name in p1.names():
-            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
-            v1 = p1.get(name)
-            v2 = p2.get(name)
-            self.assertTrue(numpy.isclose(v1, v2).all())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
deleted file mode 100644
index 6ad07167d..000000000
--- a/python/paddle/v2/tests/test_rnn_layer.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import difflib
-import unittest
-
-import paddle.trainer_config_helpers as conf_helps
-import paddle.v2.activation as activation
-import paddle.v2.data_type as data_type
-import paddle.v2.layer as layer
-from paddle.trainer_config_helpers.config_parser_utils import \
-    parse_network_config as parse_network
-from paddle.trainer_config_helpers.config_parser_utils import \
-    reset_parser
-
-
-class RNNTest(unittest.TestCase):
-    def test_simple_rnn(self):
-        dict_dim = 10
-        word_dim = 8
-        hidden_dim = 8
-
-        def parse_old_rnn():
-            reset_parser()
-
-            def step(y):
-                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
-                out = conf_helps.fc_layer(
-                    input=[y, mem],
-                    size=hidden_dim,
-                    act=activation.Tanh(),
-                    bias_attr=True,
-                    name="rnn_state")
-                return out
-
-            def test():
-                data = conf_helps.data_layer(name="word", size=dict_dim)
-                embd = conf_helps.embedding_layer(input=data, size=word_dim)
-                conf_helps.recurrent_group(
-                    name="rnn", step=step, input=embd, reverse=True)
-
-            return str(parse_network(test))
-
-        def parse_new_rnn():
-            reset_parser()
-
-            def new_step(y):
-                mem = layer.memory(name="rnn_state", size=hidden_dim)
-                out = layer.fc(input=[y, mem],
-                               size=hidden_dim,
-                               act=activation.Tanh(),
-                               bias_attr=True,
-                               name="rnn_state")
-                return out
-
-            data = layer.data(
-                name="word", type=data_type.integer_value(dict_dim))
-            embd = layer.embedding(input=data, size=word_dim)
-            rnn_layer = layer.recurrent_group(
-                name="rnn", step=new_step, input=embd, reverse=True)
-            return str(layer.parse_network(rnn_layer))
-
-        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
-                                    parse_new_rnn().splitlines(1))
-        print ''.join(diff)
-
-    def test_sequence_rnn_multi_input(self):
-        dict_dim = 10
-        word_dim = 8
-        hidden_dim = 8
-        label_dim = 3
-
-        def parse_old_rnn():
-            reset_parser()
-
-            def test():
-                data = conf_helps.data_layer(name="word", size=dict_dim)
-                label = conf_helps.data_layer(name="label", size=label_dim)
-                emb = conf_helps.embedding_layer(input=data, size=word_dim)
-                boot_layer = conf_helps.data_layer(name="boot", size=10)
-                boot_layer = conf_helps.fc_layer(
-                    name='boot_fc', input=boot_layer, size=10)
-
-                def step(y, wid):
-                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
-                    mem = conf_helps.memory(
-                        name="rnn_state",
-                        size=hidden_dim,
-                        boot_layer=boot_layer)
-                    out = conf_helps.fc_layer(
-                        input=[y, z, mem],
-                        size=hidden_dim,
-                        act=conf_helps.TanhActivation(),
-                        bias_attr=True,
-                        name="rnn_state")
-                    return out
-
-                out = conf_helps.recurrent_group(
-                    name="rnn", step=step, input=[emb, data])
-
-                rep = conf_helps.last_seq(input=out)
-                prob = conf_helps.fc_layer(
-                    size=label_dim,
-                    input=rep,
-                    act=conf_helps.SoftmaxActivation(),
-                    bias_attr=True)
-
-                conf_helps.outputs(
-                    conf_helps.classification_cost(
-                        input=prob, label=label))
-
-            return str(parse_network(test))
-
-        def parse_new_rnn():
-            reset_parser()
-            data = layer.data(
-                name="word", type=data_type.dense_vector(dict_dim))
-            label = layer.data(
-                name="label", type=data_type.dense_vector(label_dim))
-            emb = layer.embedding(input=data, size=word_dim)
-            boot_layer = layer.data(
-                name="boot", type=data_type.dense_vector(10))
-            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)
-
-            def step(y, wid):
-                z = layer.embedding(input=wid, size=word_dim)
-                mem = layer.memory(
-                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
-                out = layer.fc(input=[y, z, mem],
-                               size=hidden_dim,
-                               act=activation.Tanh(),
-                               bias_attr=True,
-                               name="rnn_state")
-                return out
-
-            out = layer.recurrent_group(
-                name="rnn", step=step, input=[emb, data])
-
-            rep = layer.last_seq(input=out)
-            prob = layer.fc(size=label_dim,
-                            input=rep,
-                            act=activation.Softmax(),
-                            bias_attr=True)
-
-            cost = layer.classification_cost(input=prob, label=label)
-
-            return str(layer.parse_network(cost))
-
-        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
-                                    parse_new_rnn().splitlines(1))
-        print ''.join(diff)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
deleted file mode 100644
index bacd28ddb..000000000
--- a/python/paddle/v2/tests/test_topology.py
+++ /dev/null
@@ -1,85 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.v2.layer as layer
-import paddle.v2.topology as topology
-import paddle.v2.data_type as data_type
-import paddle.trainer_config_helpers as conf_helps
-import paddle.trainer.PyDataProvider2 as pydp2
-
-
-class TestTopology(unittest.TestCase):
-    def test_data_type(self):
-        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
-        label = layer.data(name='label', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        cost = layer.classification_cost(input=inference, label=label)
-        topo = topology.Topology(cost)
-        data_types = topo.data_type()
-        self.assertEqual(len(data_types), 2)
-        pixel_data_type = filter(lambda type: type[0] == "pixel", data_types)
-        self.assertEqual(len(pixel_data_type), 1)
-        pixel_data_type = pixel_data_type[0]
-        self.assertEqual(pixel_data_type[1].type, pydp2.DataType.Dense)
-        self.assertEqual(pixel_data_type[1].dim, 784)
-
-        label_data_type = filter(lambda type: type[0] == "label", data_types)
-        self.assertEqual(len(label_data_type), 1)
-        label_data_type = label_data_type[0]
-        self.assertEqual(label_data_type[1].type, pydp2.DataType.Index)
-        self.assertEqual(label_data_type[1].dim, 10)
-
-    def test_get_layer(self):
-        pixel = layer.data(name='pixel2', type=data_type.dense_vector(784))
-        label = layer.data(name='label2', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        cost = layer.classification_cost(input=inference, label=label)
-        topo = topology.Topology(cost)
-        pixel_layer = topo.get_layer("pixel2")
-        label_layer = topo.get_layer("label2")
-        self.assertEqual(pixel_layer, pixel)
-        self.assertEqual(label_layer, label)
-
-    def test_parse(self):
-        pixel = layer.data(name='pixel3', type=data_type.dense_vector(784))
-        label = layer.data(name='label3', type=data_type.integer_value(10))
-        hidden = layer.fc(input=pixel,
-                          size=100,
-                          act=conf_helps.SigmoidActivation())
-        inference = layer.fc(input=hidden,
-                             size=10,
-                             act=conf_helps.SoftmaxActivation())
-        maxid = layer.max_id(input=inference)
-        cost1 = layer.classification_cost(input=inference, label=label)
-        cost2 = layer.cross_entropy_cost(input=inference, label=label)
-
-        topology.Topology(cost2).proto()
-        topology.Topology([cost1]).proto()
-        topology.Topology([cost1, cost2]).proto()
-        topology.Topology([inference, maxid]).proto()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
deleted file mode 100644
index 923ccecb0..000000000
--- a/python/paddle/v2/topology.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-
-from paddle.proto.ModelConfig_pb2 import ModelConfig
-import paddle.trainer_config_helpers as conf_helps
-import layer as v2_layer
-import config_base
-import cPickle
-from paddle.trainer import config_parser as cp
-
-__all__ = ['Topology']
-
-
-class Topology(object):
-    """
-    Topology is used to store the information about all layers
-    and network configs.
-    """
-
-    def __init__(self, layers, extra_layers=None):
-        def __check__(layers):
-            if not isinstance(layers, collections.Sequence):
-                layers = [layers]
-            for layer in layers:
-                __check_layer_type__(layer)
-            return layers
-
-        layers = __check__(layers)
-        self.layers = layers
-        if extra_layers is not None:
-            extra_layers = __check__(extra_layers)
-
-        self.__model_config__ = v2_layer.parse_network(
-            layers, extra_layers=extra_layers)
-
-        if extra_layers is not None:
-            self.layers.extend(extra_layers)
-
-        assert isinstance(self.__model_config__, ModelConfig)
-
-    def update_from_default(self):
-        # HACK(typhoonzero): update ParameterConfig(proto) in case of
-        # optimizers are defined after layers, or between layers.
-        # Must be called from trainer.__init__()
-        for parameter in self.__model_config__.parameters:
-            if parameter.momentum == 0.0 and cp.g_default_momentum:
-                parameter.momentum = cp.g_default_momentum
-            if parameter.decay_rate == 0.0 and cp.g_default_decay_rate:
-                parameter.decay_rate = cp.g_default_decay_rate
-            if parameter.initial_mean == 0.0:
-                parameter.initial_mean = cp.g_default_initial_mean
-            if parameter.initial_std == 0.01:
-                parameter.initial_std = cp.g_default_initial_std
-            if parameter.initial_strategy == 0:
-                parameter.initial_strategy = cp.g_default_initial_strategy
-            if parameter.initial_smart == False:
-                parameter.initial_smart = cp.g_default_initial_smart
-            if parameter.num_batches_regularization == 1 and \
-                cp.g_default_num_batches_regularization:
-                parameter.num_batches_regularization = \
-                    cp.g_default_num_batches_regularization
-            if parameter.gradient_clipping_threshold == 0.0 and \
-                cp.g_default_gradient_clipping_threshold:
-                parameter.gradient_clipping_threshold = \
-                    cp.g_default_gradient_clipping_threshold
-            if parameter.device == -1 and cp.g_default_device:
-                parameter.device = cp.g_default_device
-            # FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func
-
-    def use_sparse_updater(self):
-        """
-        check if any parameter require to use sparse_update
-        :return:
-        """
-        use_sparse = False
-        for parameter in self.__model_config__.parameters:
-            if parameter.sparse_update or parameter.sparse_remote_update:
-                use_sparse = True
-                break
-        return use_sparse
-
-    def proto(self):
-        return self.__model_config__
-
-    def get_layer(self, name):
-        """
-        get v2.Layer Class instance by layer name
-        :param name:
-        :return:
-        """
-        return v2_layer.get_layer(name)
-
-    def data_layers(self):
-        """
-        get all data layer
-        :return:
-        """
-        data_layers = {}
-        for layer in self.proto().layers:
-            l = v2_layer.get_layer(layer.name)
-            if l and l.layer_type == conf_helps.LayerType.DATA:
-                data_layers[layer.name] = l
-        return data_layers
-
-    def data_type(self):
-        """
-        get data_type from proto, such as:
-        [('image', dense_vector(768)), ('label', integer_value(10))]
-        """
-        data_layers = self.data_layers()
-
-        return [(nm, data_layers[nm].data_type)
-                for nm in self.proto().input_layer_names]
-
-    def get_layer_proto(self, name):
-        for layer in self.__model_config__.layers:
-            if layer.name == name:
-                return layer
-        return None
-
-    def serialize_for_inference(self, stream):
-        protobin = self.proto().SerializeToString()
-        data_type = self.data_type()
-        cPickle.dump({
-            'protobin': protobin,
-            'data_type': data_type
-        }, stream, cPickle.HIGHEST_PROTOCOL)
-
-
-def __check_layer_type__(layer):
-    if not isinstance(layer, config_base.Layer):
-        raise ValueError('layer should have type paddle.v2.config_base.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
deleted file mode 100644
index 5d98d5b6d..000000000
--- a/python/paddle/v2/trainer.py
+++ /dev/null
@@ -1,258 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Module Trainer
-"""
-import collections
-from topology import Topology
-from . import event as v2_event
-from . import optimizer as v2_optimizer
-from . import parameters as v2_parameters
-
-__all__ = ['SGD']
-
-
-def default_event_handler(event):
-    """
-    Default event handler. It will print some log and save mode.
-
-    TODO(yuyang18): Complete it!
-    :param event:
-    :return:
-    """
-    pass
-
-
-class SGD(object):
-    """
-    Simple SGD Trainer.
-    SGD Trainer combines data reader, network topolopy and update_equation together
-    to train/test a neural network.
-
-    :param cost: Target cost that neural network should be optimized.
-    :type cost: paddle.v2.config_base.Layer
-    :param parameters: The parameters dictionary.
-    :type parameters: paddle.v2.parameters.Parameters
-    :param update_equation: The optimizer object.
-    :type update_equation: paddle.v2.optimizer.Optimizer
-    :param extra_layers: Some layers in the neural network graph are not
-                         in the path of cost layer.
-    :type extra_layers: paddle.v2.config_base.Layer
-    :param is_local: Whether trainning locally
-    :type is_local: bool
-    :param pserver_spec: comma string for pserver location,
-                         eg:127.10.0.10:3000,127.10.0.11:3000,
-                         and this parameter is only used for fault
-                         tolerant mode cluster training.
-    :type pserver_spec: string
-    :param use_etcd: Whether using etcd pserver.
-    :param use_etcd: bool
-    """
-
-    def __init__(self,
-                 cost,
-                 parameters,
-                 update_equation,
-                 extra_layers=None,
-                 is_local=True,
-                 pserver_spec=None,
-                 use_etcd=True):
-
-        if not isinstance(parameters, v2_parameters.Parameters):
-            raise TypeError('parameters should be parameters')
-
-        if not isinstance(update_equation, v2_optimizer.Optimizer):
-            raise TypeError("update equation parameter must be "
-                            "paddle.v2.optimizer.Optimizer")
-        import py_paddle.swig_paddle as api
-        topology = Topology(cost, extra_layers=extra_layers)
-        # HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers
-        # are defined after layers, or between layers.
-        topology.update_from_default()
-        parameters.update_param_conf(topology.proto())
-
-        self.__optimizer__ = update_equation
-        self.__topology__ = topology
-        self.__parameters__ = parameters
-        self.__topology_in_proto__ = topology.proto()
-        self.__is_local__ = is_local
-        self.__pserver_spec__ = pserver_spec
-        self.__use_etcd__ = use_etcd
-
-        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
-        # # In local mode, disable sparse_remote_update.
-        if is_local:
-            for param in self.__topology_in_proto__.parameters:
-                if param.sparse_remote_update:
-                    param.sparse_remote_update = False
-
-        self.__gm_create_mode__ = api.CREATE_MODE_NORMAL if not \
-            self.__use_sparse_updater__ else api.CREATE_MODE_SGD_SPARSE_CPU_TRAINING
-        self.__data_types__ = topology.data_type()
-        gm = api.GradientMachine.createFromConfigProto(
-            self.__topology_in_proto__, self.__gm_create_mode__,
-            self.__optimizer__.enable_types())
-        assert isinstance(gm, api.GradientMachine)
-        self.__gradient_machine__ = gm
-        self.__gradient_machine__.randParameters()
-        self.__parameters__.append_gradient_machine(gm)
-        self.__parameter_updater__ = None
-
-    def get_topology_proto(self):
-        return self.__topology_in_proto__
-
-    def __use_remote_sparse_updater__(self):
-        return self.__use_sparse_updater__ and not self.__is_local__
-
-    def __prepare_parameter__(self, in_args):
-        """
-        prepare parameter before forward backward.
-        1. When use remote sparse updater, parameters should be got
-        from ps according to input arguments.
-        :param in_args: input arguments of this batch.
-        :return:
-        """
-        if self.__use_remote_sparse_updater__():
-            self.__gradient_machine__.prefetch(in_args)
-            self.__parameter_updater__.getParametersRemote()
-
-    def save_parameter_to_tar(self, f):
-        self.__parameter_updater__.catchUpWith()
-        self.__parameter_updater__.apply()
-        self.__parameter_updater__.getParametersRemote(True, True)
-        self.__parameters__.to_tar(f)
-        self.__parameter_updater__.restore()
-
-    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
-        """
-        Training method. Will train num_passes of input data.
-
-        :param reader: A reader that reads and yeilds data items. Usually we use a
-                       batched reader to do mini-batch training.
-        :type reader: collections.Iterable
-        :param num_passes: The total train passes.
-        :param event_handler: Event handler. A method will be invoked when event
-                              occurred.
-        :type event_handler: (BaseEvent) => None
-        :param feeding: Feeding is a map of neural network input name and array
-                        index that reader returns.
-        :type feeding: dict|list
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        from data_feeder import DataFeeder
-        if event_handler is None:
-            event_handler = default_event_handler
-        __check_train_args__(**locals())
-
-        self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__, self.__use_etcd__)
-        self.__parameter_updater__.init(self.__gradient_machine__)
-
-        self.__gradient_machine__.start()
-        batch_evaluator = self.__gradient_machine__.makeEvaluator()
-        assert isinstance(batch_evaluator, api.Evaluator)
-        pass_evaluator = self.__gradient_machine__.makeEvaluator()
-        assert isinstance(pass_evaluator, api.Evaluator)
-        out_args = api.Arguments.createArguments(0)
-        feeder = DataFeeder(self.__data_types__, feeding)
-        for pass_id in xrange(num_passes):
-            event_handler(v2_event.BeginPass(pass_id))
-            pass_evaluator.start()
-            self.__parameter_updater__.startPass()
-            for batch_id, data_batch in enumerate(reader()):
-                batch_evaluator.start()
-                event_handler(
-                    v2_event.BeginIteration(
-                        pass_id=pass_id, batch_id=batch_id))
-                pass_type = self.__parameter_updater__.startBatch(
-                    len(data_batch))
-                in_args = feeder(data_batch)
-                self.__prepare_parameter__(in_args)
-                self.__gradient_machine__.forwardBackward(in_args, out_args,
-                                                          pass_type)
-                self.__gradient_machine__.eval(pass_evaluator)
-                self.__gradient_machine__.eval(batch_evaluator)
-                event_handler(
-                    v2_event.EndForwardBackward(
-                        pass_id=pass_id,
-                        batch_id=batch_id,
-                        gm=self.__gradient_machine__))
-                for each_param in self.__gradient_machine__.getNonStaticParameters(
-                ):
-                    self.__parameter_updater__.update(each_param)
-                cost_sum = out_args.sum()
-                cost = cost_sum / len(data_batch)
-                self.__parameter_updater__.finishBatch(cost)
-                batch_evaluator.finish()
-                event_handler(
-                    v2_event.EndIteration(
-                        pass_id=pass_id,
-                        batch_id=batch_id,
-                        cost=cost,
-                        evaluator=batch_evaluator,
-                        gm=self.__gradient_machine__))
-
-            self.__parameter_updater__.finishPass()
-            pass_evaluator.finish()
-            event_handler(
-                v2_event.EndPass(
-                    pass_id,
-                    evaluator=pass_evaluator,
-                    gm=self.__gradient_machine__))
-        self.__gradient_machine__.finish()
-
-    def test(self, reader, feeding=None):
-        """
-        Testing method. Will test input data.
-
-        :param reader: A batch reader that reads and yeilds data items,
-                       it should be a paddle.v2.batch.
-        :type reader: collections.Iterable
-        :param feeding: Feeding is a map of neural network input name and array
-                        index that reader returns.
-        :type feeding: dict
-        :return:
-        """
-        import py_paddle.swig_paddle as api
-        from data_feeder import DataFeeder
-        feeder = DataFeeder(self.__data_types__, feeding)
-        evaluator = self.__gradient_machine__.makeEvaluator()
-        out_args = api.Arguments.createArguments(0)
-        evaluator.start()
-        total_cost = 0
-        num_samples = 0.0
-        for data_batch in reader():
-            num_samples += len(data_batch)
-            in_args = feeder(data_batch)
-            self.__prepare_parameter__(in_args)
-            self.__gradient_machine__.forward(in_args, out_args, api.PASS_TEST)
-            total_cost += out_args.sum()
-            self.__gradient_machine__.eval(evaluator)
-
-        evaluator.finish()
-        return v2_event.TestResult(
-            evaluator=evaluator, cost=total_cost / num_samples)
-
-
-def __check_train_args__(reader, event_handler, **kwargs):
-    """
-    Check train function's argument types
-    """
-    if not callable(reader) or not isinstance(reader(), collections.Iterator):
-        raise TypeError('train_data_reader should be a function, '
-                        'which can return a iterator')
-    if not callable(event_handler):
-        raise TypeError('event handler should be a function')
-- 
GitLab


From 1777017a05fc178a5861b1311b686da3e8ecd60a Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:41:09 +0800
Subject: [PATCH 3/9] remove legacy go and proto code

---
 go/.gitignore                                 |   3 -
 go/CMakeLists.txt                             |  23 -
 go/cmd/master/CMakeLists.txt                  |  15 -
 go/cmd/master/master.go                       | 120 ---
 go/cmd/pserver/.gitignore                     |   1 -
 go/cmd/pserver/CMakeLists.txt                 |  15 -
 go/cmd/pserver/pserver.go                     | 108 ---
 go/connection/conn.go                         | 120 ---
 go/glide.lock                                 | 233 ------
 go/glide.yaml                                 |  33 -
 go/master/CMakeLists.txt                      |  17 -
 go/master/c/CMakeLists.txt                    |  15 -
 go/master/c/client.go                         | 196 -----
 go/master/client.go                           | 255 -------
 go/master/client_internal_test.go             | 152 ----
 go/master/client_test.go                      | 150 ----
 go/master/etcd_client.go                      | 201 -----
 go/master/inmem_store.go                      |  47 --
 go/master/service.go                          | 510 -------------
 go/master/service_internal_test.go            |  52 --
 go/master/service_test.go                     |  72 --
 go/proto/.gitignore                           |   4 -
 go/pserver/CMakeLists.txt                     |  17 -
 go/pserver/client/CMakeLists.txt              |  17 -
 go/pserver/client/c/.gitignore                |   1 -
 go/pserver/client/c/CMakeLists.txt            |  30 -
 go/pserver/client/c/cclient.go                | 300 --------
 go/pserver/client/c/test/CMakeLists.txt       |  15 -
 go/pserver/client/c/test/test_cclient.c       | 115 ---
 go/pserver/client/c/test/test_mnist.py        | 145 ----
 go/pserver/client/c/test/test_train.py        |  89 ---
 .../client/c/test/testdata/optimizer.pb       | Bin 50 -> 0 bytes
 go/pserver/client/client.go                   | 237 ------
 go/pserver/client/client_test.go              | 268 -------
 go/pserver/client/etcd_client.go              | 266 -------
 go/pserver/client/etcd_client_test.go         | 106 ---
 go/pserver/etcd_client.go                     | 253 -------
 go/pserver/optimizer.go                       | 132 ----
 go/pserver/optimizer_test.go                  |  78 --
 go/pserver/service.go                         | 450 -----------
 go/pserver/service_internal_test.go           |  86 ---
 go/pserver/service_test.go                    | 211 ------
 go/utils/networkhelper/CMakeLists.txt         |  17 -
 go/utils/networkhelper/helper.go              |  59 --
 go/utils/networkhelper/helper_test.go         |  24 -
 proto/.gitignore                              |   1 -
 proto/CMakeLists.txt                          |  57 --
 proto/DataConfig.proto                        |  86 ---
 proto/DataFormat.proto                        |  76 --
 proto/ModelConfig.proto                       | 698 ------------------
 proto/OptimizerConfig.proto                   | 164 ----
 proto/ParameterConfig.proto                   |  83 ---
 proto/ParameterServerConfig.proto             |  50 --
 proto/ParameterService.proto                  | 351 ---------
 proto/README.md                               |   3 -
 proto/TrainerConfig.proto                     | 160 ----
 56 files changed, 6987 deletions(-)
 delete mode 100644 go/.gitignore
 delete mode 100644 go/CMakeLists.txt
 delete mode 100644 go/cmd/master/CMakeLists.txt
 delete mode 100644 go/cmd/master/master.go
 delete mode 100644 go/cmd/pserver/.gitignore
 delete mode 100644 go/cmd/pserver/CMakeLists.txt
 delete mode 100644 go/cmd/pserver/pserver.go
 delete mode 100644 go/connection/conn.go
 delete mode 100644 go/glide.lock
 delete mode 100644 go/glide.yaml
 delete mode 100644 go/master/CMakeLists.txt
 delete mode 100644 go/master/c/CMakeLists.txt
 delete mode 100644 go/master/c/client.go
 delete mode 100644 go/master/client.go
 delete mode 100644 go/master/client_internal_test.go
 delete mode 100644 go/master/client_test.go
 delete mode 100644 go/master/etcd_client.go
 delete mode 100644 go/master/inmem_store.go
 delete mode 100644 go/master/service.go
 delete mode 100644 go/master/service_internal_test.go
 delete mode 100644 go/master/service_test.go
 delete mode 100644 go/proto/.gitignore
 delete mode 100644 go/pserver/CMakeLists.txt
 delete mode 100644 go/pserver/client/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/.gitignore
 delete mode 100644 go/pserver/client/c/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/cclient.go
 delete mode 100644 go/pserver/client/c/test/CMakeLists.txt
 delete mode 100644 go/pserver/client/c/test/test_cclient.c
 delete mode 100644 go/pserver/client/c/test/test_mnist.py
 delete mode 100644 go/pserver/client/c/test/test_train.py
 delete mode 100644 go/pserver/client/c/test/testdata/optimizer.pb
 delete mode 100644 go/pserver/client/client.go
 delete mode 100644 go/pserver/client/client_test.go
 delete mode 100644 go/pserver/client/etcd_client.go
 delete mode 100644 go/pserver/client/etcd_client_test.go
 delete mode 100644 go/pserver/etcd_client.go
 delete mode 100644 go/pserver/optimizer.go
 delete mode 100644 go/pserver/optimizer_test.go
 delete mode 100644 go/pserver/service.go
 delete mode 100644 go/pserver/service_internal_test.go
 delete mode 100644 go/pserver/service_test.go
 delete mode 100644 go/utils/networkhelper/CMakeLists.txt
 delete mode 100644 go/utils/networkhelper/helper.go
 delete mode 100644 go/utils/networkhelper/helper_test.go
 delete mode 100644 proto/.gitignore
 delete mode 100644 proto/CMakeLists.txt
 delete mode 100644 proto/DataConfig.proto
 delete mode 100644 proto/DataFormat.proto
 delete mode 100644 proto/ModelConfig.proto
 delete mode 100644 proto/OptimizerConfig.proto
 delete mode 100644 proto/ParameterConfig.proto
 delete mode 100644 proto/ParameterServerConfig.proto
 delete mode 100644 proto/ParameterService.proto
 delete mode 100644 proto/README.md
 delete mode 100644 proto/TrainerConfig.proto

diff --git a/go/.gitignore b/go/.gitignore
deleted file mode 100644
index 398d70ca3..000000000
--- a/go/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-vendor/
-.glide/
-proto/*.go
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
deleted file mode 100644
index f3a9296c2..000000000
--- a/go/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-add_subdirectory(pserver/client/c)
-add_subdirectory(cmd/pserver)
-add_subdirectory(cmd/master)
-add_subdirectory(master/c)
-add_subdirectory(master)
-add_subdirectory(pserver)
-add_subdirectory(pserver/client)
-add_subdirectory(utils/networkhelper)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
deleted file mode 100644
index fc99d8d3b..000000000
--- a/go/cmd/master/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-go_binary(master SRC master.go)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
deleted file mode 100644
index 537df59c8..000000000
--- a/go/cmd/master/master.go
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"os/signal"
-	"strconv"
-	"strings"
-	"time"
-
-	log "github.com/inconshreveable/log15"
-	"github.com/namsral/flag"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
-)
-
-func main() {
-	port := flag.Int("port", 8080, "port of the master server.")
-	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
-	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
-	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warn, error, crit")
-	flag.Parse()
-
-	lvl, err := log.LvlFromString(*logLevel)
-	if err != nil {
-		panic(err)
-	}
-
-	log.Root().SetHandler(
-		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-
-	if *endpoints == "" {
-		log.Warn("-endpoints not set, fault tolerance not be enabled.")
-	}
-
-	var store master.Store
-	if *endpoints != "" {
-		eps := strings.Split(*endpoints, ",")
-		ip, err := networkhelper.GetExternalIP()
-		if err != nil {
-			log.Crit("get external ip error", log.Ctx{"error": err})
-			panic(err)
-		}
-
-		addr := fmt.Sprintf("%s:%d", ip, *port)
-		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
-		if err != nil {
-			log.Crit("error creating etcd client.", log.Ctx{"error": err})
-			panic(err)
-		}
-	} else {
-		store = &master.InMemStore{}
-	}
-
-	shutdown := func() {
-		log.Info("shutting down gracefully")
-		err := store.Shutdown()
-		if err != nil {
-			log.Error("shutdown error", log.Ctx{"error": err})
-		}
-	}
-
-	// Guaranteed to run even panic happens.
-	defer shutdown()
-
-	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt)
-
-	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
-	if err != nil {
-		log.Crit("error creating new service.", log.Ctx{"error": err})
-		panic(err)
-	}
-
-	err = rpc.Register(s)
-	if err != nil {
-		log.Crit("error registering to etcd.", log.Ctx{"error": err})
-		panic(err)
-	}
-
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		log.Crit("error listing to port", log.Ctx{"error": err, "port": *port})
-		panic(err)
-	}
-
-	go func() {
-		err = http.Serve(l, nil)
-		if err != nil {
-			log.Crit("error serving HTTP", log.Ctx{"error": err})
-			panic(err)
-		}
-	}()
-
-	<-c
-}
diff --git a/go/cmd/pserver/.gitignore b/go/cmd/pserver/.gitignore
deleted file mode 100644
index fffd9adc4..000000000
--- a/go/cmd/pserver/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-pserver
diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt
deleted file mode 100644
index 20d033c93..000000000
--- a/go/cmd/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
deleted file mode 100644
index 271274caf..000000000
--- a/go/cmd/pserver/pserver.go
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-import (
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"os/signal"
-	"strconv"
-	"time"
-
-	"github.com/namsral/flag"
-	"github.com/topicai/candy"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/inconshreveable/log15"
-)
-
-func main() {
-	port := flag.Int("port", 8001, "port of the pserver")
-	index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry")
-	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
-		"comma separated endpoint string for pserver to connect to etcd")
-	dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
-	etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds")
-	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
-	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
-	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
-	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warn, error, crit")
-	flag.Parse()
-
-	lvl, err := log.LvlFromString(*logLevel)
-	if err != nil {
-		panic(err)
-	}
-
-	log.Root().SetHandler(
-		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-
-	var idx int
-
-	var cp pserver.Checkpoint
-	var e *pserver.EtcdClient
-	if *index >= 0 {
-		idx = *index
-	} else {
-		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL)
-		idx, err = e.Register(*port)
-		candy.Must(err)
-
-		cp, err = pserver.LoadCheckpoint(e, idx)
-		if err != nil {
-			if err == pserver.ErrCheckpointNotFound {
-				log.Info("load checkpoint error", "error", err)
-			} else {
-				panic(err)
-			}
-		}
-	}
-
-	shutdown := func() {
-		log.Info("shutting down gracefully")
-		sErr := e.Shutdown()
-		if sErr != nil {
-			log.Error("error shutting down", log.Ctx{"error": sErr})
-		}
-	}
-
-	// Guaranteed to run even panic happens.
-	defer shutdown()
-
-	c := make(chan os.Signal, 1)
-	signal.Notify(c, os.Interrupt)
-
-	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
-	candy.Must(err)
-
-	err = rpc.Register(s)
-	candy.Must(err)
-
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	candy.Must(err)
-
-	go func() {
-		log.Info("serving pserver", log.Ctx{"port": *port})
-		err = http.Serve(l, nil)
-		candy.Must(err)
-	}()
-
-	<-c
-}
diff --git a/go/connection/conn.go b/go/connection/conn.go
deleted file mode 100644
index b8353e8e1..000000000
--- a/go/connection/conn.go
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package connection
-
-import (
-	"errors"
-	"net/rpc"
-	"sync"
-
-	log "github.com/sirupsen/logrus"
-)
-
-// TODO(helin): add TCP re-connect logic
-
-// Conn is a connection to a parameter server
-type Conn struct {
-	mu       sync.Mutex
-	client   *rpc.Client
-	waitConn chan struct{}
-}
-
-// New creates a new connection.
-func New() *Conn {
-	c := &Conn{}
-	return c
-}
-
-// Close closes the connection.
-func (c *Conn) Close() error {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-
-	if c.client == nil {
-		return nil
-	}
-
-	return c.client.Close()
-}
-
-// Connect connects the connection to a address.
-func (c *Conn) Connect(addr string) error {
-	c.mu.Lock()
-	if c.client != nil {
-		err := c.client.Close()
-		if err != nil {
-			c.mu.Unlock()
-			return err
-		}
-
-		c.client = nil
-	}
-	c.mu.Unlock()
-
-	client, err := rpc.DialHTTP("tcp", addr)
-	if err != nil {
-		return err
-	}
-
-	c.mu.Lock()
-	defer c.mu.Unlock()
-
-	if c.client == nil {
-		c.client = client
-		if c.waitConn != nil {
-			close(c.waitConn)
-			c.waitConn = nil
-		}
-	} else {
-		err := client.Close()
-		if err != nil {
-			log.Errorln(err)
-		}
-
-		return errors.New("client already set from a concurrent goroutine")
-	}
-
-	return nil
-}
-
-// TODO(helin): refactor Call to be able to perform given retry
-// policy.
-
-// Call make a RPC call.
-//
-// Call will be blocked until the connection to remote RPC service
-// being established.
-func (c *Conn) Call(serviceMethod string, args interface{}, reply interface{}) error {
-	c.mu.Lock()
-	client := c.client
-	var waitCh chan struct{}
-	if client == nil {
-		if c.waitConn != nil {
-			waitCh = c.waitConn
-		} else {
-			waitCh = make(chan struct{})
-			c.waitConn = waitCh
-		}
-	}
-	c.mu.Unlock()
-
-	if waitCh != nil {
-		// wait until new connection being established
-		<-waitCh
-		return c.Call(serviceMethod, args, reply)
-	}
-
-	return client.Call(serviceMethod, args, reply)
-}
diff --git a/go/glide.lock b/go/glide.lock
deleted file mode 100644
index d15fc934d..000000000
--- a/go/glide.lock
+++ /dev/null
@@ -1,233 +0,0 @@
-hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
-updated: 2017-10-30T03:46:19.137696069Z
-imports:
-- name: github.com/alecthomas/gometalinter
-  version: bae2f1293d092fd8167939d5108d1b025eaef9de
-- name: github.com/beorn7/perks
-  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
-  subpackages:
-  - quantile
-- name: github.com/boltdb/bolt
-  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
-- name: github.com/cockroachdb/cmux
-  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
-- name: github.com/coreos/etcd
-  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
-  subpackages:
-  - alarm
-  - auth
-  - auth/authpb
-  - client
-  - clientv3
-  - clientv3/concurrency
-  - compactor
-  - discovery
-  - embed
-  - error
-  - etcdserver
-  - etcdserver/api
-  - etcdserver/api/etcdhttp
-  - etcdserver/api/v2http
-  - etcdserver/api/v2http/httptypes
-  - etcdserver/api/v3client
-  - etcdserver/api/v3election
-  - etcdserver/api/v3election/v3electionpb
-  - etcdserver/api/v3election/v3electionpb/gw
-  - etcdserver/api/v3lock
-  - etcdserver/api/v3lock/v3lockpb
-  - etcdserver/api/v3lock/v3lockpb/gw
-  - etcdserver/api/v3rpc
-  - etcdserver/api/v3rpc/rpctypes
-  - etcdserver/auth
-  - etcdserver/etcdserverpb
-  - etcdserver/etcdserverpb/gw
-  - etcdserver/membership
-  - etcdserver/stats
-  - lease
-  - lease/leasehttp
-  - lease/leasepb
-  - mvcc
-  - mvcc/backend
-  - mvcc/mvccpb
-  - pkg/adt
-  - pkg/contention
-  - pkg/cors
-  - pkg/cpuutil
-  - pkg/crc
-  - pkg/debugutil
-  - pkg/fileutil
-  - pkg/httputil
-  - pkg/idutil
-  - pkg/ioutil
-  - pkg/logutil
-  - pkg/monotime
-  - pkg/netutil
-  - pkg/pathutil
-  - pkg/pbutil
-  - pkg/runtime
-  - pkg/schedule
-  - pkg/srv
-  - pkg/tlsutil
-  - pkg/transport
-  - pkg/types
-  - pkg/wait
-  - proxy/grpcproxy/adapter
-  - raft
-  - raft/raftpb
-  - rafthttp
-  - snap
-  - snap/snappb
-  - store
-  - version
-  - wal
-  - wal/walpb
-- name: github.com/coreos/go-semver
-  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
-  subpackages:
-  - semver
-- name: github.com/coreos/go-systemd
-  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
-  subpackages:
-  - daemon
-  - journal
-  - util
-- name: github.com/coreos/pkg
-  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
-  subpackages:
-  - capnslog
-- name: github.com/dgrijalva/jwt-go
-  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
-- name: github.com/ghodss/yaml
-  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
-- name: github.com/go-stack/stack
-  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
-- name: github.com/gogo/protobuf
-  version: 909568be09de550ed094403c2bf8a261b5bb730a
-  subpackages:
-  - proto
-- name: github.com/golang/protobuf
-  version: 4bd1920723d7b7c925de087aa32e2187708897f7
-  subpackages:
-  - jsonpb
-  - proto
-- name: github.com/golang/snappy
-  version: 553a641470496b2327abcac10b36396bd98e45c9
-- name: github.com/google/btree
-  version: 925471ac9e2131377a91e1595defec898166fe49
-- name: github.com/grpc-ecosystem/go-grpc-prometheus
-  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
-- name: github.com/grpc-ecosystem/grpc-gateway
-  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
-  subpackages:
-  - runtime
-  - runtime/internal
-  - utilities
-- name: github.com/inconshreveable/log15
-  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
-- name: github.com/jonboulle/clockwork
-  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
-- name: github.com/mattn/go-colorable
-  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
-- name: github.com/mattn/go-isatty
-  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
-- name: github.com/matttproud/golang_protobuf_extensions
-  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
-  subpackages:
-  - pbutil
-- name: github.com/namsral/flag
-  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
-- name: github.com/PaddlePaddle/recordio
-  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
-- name: github.com/prometheus/client_golang
-  version: c5b7fccd204277076155f10851dad72b76a49317
-  subpackages:
-  - prometheus
-- name: github.com/prometheus/client_model
-  version: 6f3806018612930941127f2a7c6c453ba2c527d2
-  subpackages:
-  - go
-- name: github.com/prometheus/common
-  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
-  subpackages:
-  - expfmt
-  - internal/bitbucket.org/ww/goautoneg
-  - model
-- name: github.com/prometheus/procfs
-  version: a1dba9ce8baed984a2495b658c82687f8157b98f
-  subpackages:
-  - xfs
-- name: github.com/satori/go.uuid
-  version: 879c5887cd475cd7864858769793b2ceb0d44feb
-- name: github.com/sirupsen/logrus
-  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
-- name: github.com/topicai/candy
-  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
-- name: github.com/ugorji/go
-  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
-  subpackages:
-  - codec
-- name: github.com/xiang90/probing
-  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
-- name: golang.org/x/crypto
-  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
-  repo: https://github.com/golang/crypto.git
-  vcs: git
-  subpackages:
-  - bcrypt
-  - blowfish
-  - ssh/terminal
-- name: golang.org/x/net
-  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
-  subpackages:
-  - context
-  - http2
-  - http2/hpack
-  - idna
-  - internal/timeseries
-  - lex/httplex
-  - trace
-- name: golang.org/x/sys
-  version: e48874b42435b4347fc52bdee0424a52abc974d7
-  repo: https://github.com/golang/sys.git
-  vcs: git
-  subpackages:
-  - unix
-  - windows
-- name: golang.org/x/text
-  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
-  repo: https://github.com/golang/text.git
-  vcs: git
-  subpackages:
-  - secure/bidirule
-  - transform
-  - unicode/bidi
-  - unicode/norm
-- name: google.golang.org/grpc
-  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
-  subpackages:
-  - codes
-  - credentials
-  - grpclog
-  - internal
-  - keepalive
-  - metadata
-  - naming
-  - peer
-  - stats
-  - tap
-  - transport
-- name: gopkg.in/yaml.v2
-  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
-testImports:
-- name: github.com/davecgh/go-spew
-  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
-  subpackages:
-  - spew
-- name: github.com/pmezard/go-difflib
-  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
-  subpackages:
-  - difflib
-- name: github.com/stretchr/testify
-  version: 05e8a0eda380579888eb53c394909df027f06991
-  subpackages:
-  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
deleted file mode 100644
index c5d66694a..000000000
--- a/go/glide.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-package: github.com/PaddlePaddle/Paddle/go
-import:
-- package: github.com/PaddlePaddle/recordio
-- package: github.com/coreos/etcd
-  version: ^3.2.1
-  subpackages:
-  - clientv3
-  - clientv3/concurrency
-  - embed
-  - etcdserver
-- package: github.com/namsral/flag
-  version: ^1.7.4-pre
-- package: github.com/sirupsen/logrus
-  version: ^1.0.0
-- package: github.com/topicai/candy
-- package: golang.org/x/crypto
-  repo: https://github.com/golang/crypto.git
-  vcs: git
-- package: golang.org/x/sys
-  repo: https://github.com/golang/sys.git
-  vcs: git
-- package: golang.org/x/text
-  repo: https://github.com/golang/text.git
-  vcs: git
-- package: github.com/satori/go.uuid
-  version: v1.1.0
-- package: github.com/alecthomas/gometalinter
-  version: v1.2.1
-- package: github.com/inconshreveable/log15
-  version: v2.13
-- package: github.com/go-stack/stack
-  version: v1.6.0
-- package: github.com/golang/protobuf
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
deleted file mode 100644
index b5101c347..000000000
--- a/go/master/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(master_test)
-endif()
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
deleted file mode 100644
index 58b44e644..000000000
--- a/go/master/c/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-go_library(paddle_master SHARED DEPS paddle_go_optimizer)
diff --git a/go/master/c/client.go b/go/master/c/client.go
deleted file mode 100644
index 42c176d00..000000000
--- a/go/master/c/client.go
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-/*
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#define PADDLE_MASTER_OK    0
-#define PADDLE_MASTER_ERROR -1
-
-#define PADDLE_SAVE_MODEL_OK   1
-#define PADDLE_SAVE_MODEL_SKIP 0
-
-typedef int paddle_master_client;
-*/
-import "C"
-
-import (
-	"strings"
-	"sync"
-	"time"
-	"unsafe"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	log "github.com/inconshreveable/log15"
-)
-
-var mu sync.Mutex
-var handleMap = make(map[C.paddle_master_client]*master.Client)
-var curHandle C.paddle_master_client
-
-func init() {
-	log.Root().SetHandler(
-		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-}
-
-func add(c *master.Client) C.paddle_master_client {
-	mu.Lock()
-	defer mu.Unlock()
-	client := curHandle
-	curHandle++
-	handleMap[client] = c
-	return client
-}
-
-func get(client C.paddle_master_client) *master.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	return handleMap[client]
-}
-
-func remove(client C.paddle_master_client) *master.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	h := handleMap[client]
-	delete(handleMap, client)
-	return h
-}
-
-//export paddle_new_etcd_master_client
-//
-// bufSize is the record buffer size.
-func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
-	p := C.GoString(etcdEndpoints)
-	endpoints := strings.Split(p, ",")
-	c, err := master.NewClient(
-		master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
-		master.WithBuffer(bufSize),
-	)
-	if err != nil {
-		panic(err)
-	}
-
-	return add(c)
-}
-
-//export paddle_new_master_client
-//
-// bufSize is the record buffer size.
-func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
-	a := C.GoString(addr)
-	c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
-	if err != nil {
-		panic(err)
-	}
-
-	return add(c)
-}
-
-//export paddle_release_master_client
-func paddle_release_master_client(client C.paddle_master_client) {
-	remove(client)
-}
-
-//export paddle_start_get_records
-func paddle_start_get_records(client C.paddle_master_client, pass C.int) {
-	c := get(client)
-	c.StartGetRecords(int(pass))
-}
-
-//export paddle_set_dataset
-func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
-	c := get(client)
-	var paths []string
-	for i := 0; i < int(size); i++ {
-		ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
-		str := C.GoString(*ptr)
-		paths = append(paths, str)
-	}
-	err := c.SetDataset(paths)
-	if err != nil {
-		log.Error("error set dataset",
-			log.Ctx{"error": err, "paths": paths})
-		return C.PADDLE_MASTER_ERROR
-	}
-
-	return C.PADDLE_MASTER_OK
-}
-
-// paddle_next_record gets the nexts training record.
-//
-// returns number of bytes of the records if success, -1 if failed, -2 if pass end.
-//
-//export paddle_next_record
-func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
-	c := get(client)
-	r, err := c.NextRecord()
-	if err != nil {
-		// NOTE: use errors to indicate pass ends
-		if err.Error() == master.ErrAllTaskFailed.Error() ||
-			err.Error() == master.ErrNoMoreAvailable.Error() ||
-			err.Error() == master.ErrPassBefore.Error() {
-			return -2
-		}
-		*record = (*C.uchar)(nil)
-		return -1
-	}
-
-	if len(r) == 0 {
-		// Empty record
-		*record = (*C.uchar)(nil)
-		return 0
-	}
-
-	size := C.size_t(len(r))
-	*record = (*C.uchar)(C.malloc(size))
-	C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
-	return C.int(size)
-}
-
-// paddle_request_save_model requests the master server to approve the
-// caller to save the model.
-//
-// returns 1 if the save the model request is approved, 0 if the
-// request is rejected because other trainer is saving the model, -1
-// if error happened.
-//
-//export paddle_request_save_model
-func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
-	c := get(client)
-	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
-	if err != nil {
-		log.Error("error request save model", log.Ctx{"error": err})
-		return C.PADDLE_MASTER_ERROR
-	}
-
-	if need {
-		return C.PADDLE_SAVE_MODEL_OK
-	}
-
-	return C.PADDLE_SAVE_MODEL_SKIP
-}
-
-//export mem_free
-func mem_free(p unsafe.Pointer) {
-	// "free" may be a better name for this function, but doing so
-	// will cause calling any function of this library from Python
-	// ctypes hanging.
-	C.free(p)
-}
-
-func main() {}
diff --git a/go/master/client.go b/go/master/client.go
deleted file mode 100644
index e43903dd1..000000000
--- a/go/master/client.go
+++ /dev/null
@@ -1,255 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"os"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/recordio"
-	"github.com/coreos/etcd/clientv3"
-	log "github.com/inconshreveable/log15"
-)
-
-// Client is the client of the master server.
-type Client struct {
-	conn    *connection.Conn
-	ch      chan record
-	bufSize int
-}
-
-type record struct {
-	r   []byte
-	err error
-}
-
-// WithBuffer sets the client to buffer the training record.
-//
-// bufSize is the record buffer size. NextRecord will read from this
-// buffer.
-func WithBuffer(bufSize int) func(*Client) error {
-	return func(c *Client) error {
-		if bufSize <= 0 {
-			return nil
-		}
-		c.bufSize = bufSize
-		return nil
-	}
-}
-
-// WithAddr sets the client to use fixed master address.
-func WithAddr(addr string) func(c *Client) error {
-	return func(c *Client) error {
-		ch := make(chan string, 1)
-		ch <- addr
-		go c.monitorMaster(ch)
-		return nil
-	}
-}
-
-// WithEtcd sets the client to use etcd for master discovery.
-func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
-	return func(c *Client) error {
-		var cli *clientv3.Client
-		f := func() error {
-			var err error
-			cli, err = clientv3.New(clientv3.Config{
-				Endpoints:   endpoints,
-				DialTimeout: timeout,
-			})
-			return err
-		}
-		for {
-			err := f()
-			if err != nil {
-				log.Warn("create etcd client error", log.Ctx{"error": err})
-			} else {
-				break
-			}
-			time.Sleep(time.Second)
-		}
-
-		ch := make(chan string, 1)
-		a, err := GetKey(cli, DefaultAddrPath, timeout)
-		if err != nil {
-			return err
-		}
-
-		if a != "" {
-			// Master is registered, send to the master address
-			// channel.
-			ch <- a
-		}
-
-		go watchKey(cli, DefaultAddrPath, ch)
-		go c.monitorMaster(ch)
-		return nil
-	}
-}
-
-// NewClient creates a new Client.
-func NewClient(opts ...func(*Client) error) (*Client, error) {
-	c := &Client{}
-	c.conn = connection.New()
-
-	for _, opt := range opts {
-		err := opt(c)
-		if err != nil {
-			return nil, err
-		}
-	}
-	c.ch = make(chan record, c.bufSize)
-	return c, nil
-}
-
-// StartGetRecords must be called at beginning of each pass
-func (c *Client) StartGetRecords(passID int) {
-	go c.getRecords(passID)
-}
-
-func (c *Client) getRecords(passID int) {
-	i := 0
-	for {
-		t, err := c.getTask(passID)
-		if err != nil {
-			if err.Error() == ErrPassBefore.Error() ||
-				err.Error() == ErrNoMoreAvailable.Error() ||
-				err.Error() == ErrAllTaskFailed.Error() {
-				c.ch <- record{nil, err}
-				break
-			}
-
-			if i%60 == 0 {
-				log.Debug("getTask of passID error.",
-					log.Ctx{"error": err, "passID": passID})
-				i = 0
-			}
-
-			// if err.Error() == ErrPassAfter.Error()
-			//   wait util last pass finishes
-			// if other error such as network error
-			//   wait to reconnect or task time out
-			time.Sleep(time.Second * 3)
-			i += 3
-			continue
-		}
-
-		for _, chunk := range t.Chunks {
-			f, e := os.Open(chunk.Path)
-			if e != nil {
-				log.Error("error open chunk", log.Ctx{"error": e})
-				continue
-			}
-
-			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
-			for s.Scan() {
-				c.ch <- record{s.Record(), nil}
-			}
-
-			if s.Err() != nil {
-				c.ch <- record{nil, s.Err()}
-				log.Error(
-					"error scan chunk",
-					log.Ctx{"error": err, "path": chunk.Path},
-				)
-			}
-
-			err = f.Close()
-			if err != nil {
-				log.Error("error close record file", log.Ctx{"error": err})
-			}
-		}
-
-		// We treat a task as finished whenever the last data
-		// instance of the task is read. This is not exactly
-		// correct, but a reasonable approximation.
-		err = c.taskFinished(t.Meta.ID)
-		if err != nil {
-			log.Error("task finish callback error.", log.Ctx{"error": err})
-		}
-	}
-}
-
-func (c *Client) monitorMaster(addrCh <-chan string) {
-	lastMaster := ""
-	for curMaster := range addrCh {
-		// connect to the new address once address changed.
-		if curMaster != lastMaster {
-			if curMaster == "" {
-				err := c.conn.Close()
-				if err != nil {
-					log.Error("close old master addr error", log.Ctx{"error": err})
-				}
-			} else {
-				err := c.conn.Connect(curMaster)
-				if err != nil {
-					log.Error("connect to new master addr error", log.Ctx{"error": err})
-
-					// connect to addr failed, set
-					// to last known addr in order
-					// to retry next time.
-					curMaster = lastMaster
-				}
-			}
-		}
-		lastMaster = curMaster
-	}
-}
-
-// SetDataset sets dataset to dispatch for the master server.
-//
-// SetDataset can be call multiple times at one pass. But only the first call
-// will be honored.
-//
-// After all tasks are done, another call of SetDataset will start another pass.
-func (c *Client) SetDataset(globPaths []string) error {
-	err := c.conn.Call("Service.SetDataset", globPaths, nil)
-	return err
-}
-
-// getTask gets a new task from the master server.
-func (c *Client) getTask(passID int) (Task, error) {
-	var t Task
-	err := c.conn.Call("Service.GetTask", passID, &t)
-	return t, err
-}
-
-// TaskFinished tells the master server a task is finished.
-func (c *Client) taskFinished(taskID int) error {
-	return c.conn.Call("Service.TaskFinished", taskID, nil)
-}
-
-// TaskFailed tell the master server as task is failed.
-func (c *Client) taskFailed(meta TaskMeta) error {
-	return c.conn.Call("Service.TaskFailed", meta, nil)
-}
-
-// NextRecord returns next record in the dataset.
-//
-// NextRecord will block until the next record is available. It is
-// thread-safe.
-func (c *Client) NextRecord() ([]byte, error) {
-	r := <-c.ch
-	return r.r, r.err
-}
-
-// RequestSaveModel requests the master server to approve the caller
-// to save the model.
-func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
-	var need bool
-	err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need)
-	return need, err
-}
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
deleted file mode 100644
index 37028a9e1..000000000
--- a/go/master/client_internal_test.go
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"strconv"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/recordio"
-)
-
-const (
-	totalTask    = 20
-	chunkPerTask = 10
-)
-
-func TestGetFinishTask(t *testing.T) {
-	const path = "/tmp/master_client_test_0"
-
-	l, err := net.Listen("tcp", ":0")
-	if err != nil {
-		panic(err)
-	}
-
-	ss := strings.Split(l.Addr().String(), ":")
-	p, err := strconv.Atoi(ss[len(ss)-1])
-	if err != nil {
-		panic(err)
-	}
-	go func(l net.Listener) {
-		s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
-		if sErr != nil {
-			panic(sErr)
-		}
-
-		server := rpc.NewServer()
-		sErr = server.Register(s)
-		if sErr != nil {
-			panic(sErr)
-		}
-
-		mux := http.NewServeMux()
-		mux.Handle(rpc.DefaultRPCPath, server)
-		sErr = http.Serve(l, mux)
-		if sErr != nil {
-			panic(sErr)
-		}
-	}(l)
-
-	f, err := os.Create(path)
-	if err != nil {
-		panic(err)
-	}
-
-	for i := 0; i < totalTask*chunkPerTask; i++ {
-		w := recordio.NewWriter(f, -1, -1)
-		_, err = w.Write(nil)
-		if err != nil {
-			panic(err)
-		}
-
-		// call Close to force RecordIO writing a chunk.
-		err = w.Close()
-		if err != nil {
-			panic(err)
-		}
-	}
-	err = f.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	// Manually intialize client to avoid calling c.getRecords()
-	c := &Client{}
-	c.conn = connection.New()
-	addr := fmt.Sprintf(":%d", p)
-	ch := make(chan string, 1)
-	ch <- addr
-	go c.monitorMaster(ch)
-
-	err = c.SetDataset([]string{path})
-	if err != nil {
-		panic(err)
-	}
-
-	checkOnePass := func(i int) {
-		var tasks []Task
-		for idx := 0; idx < totalTask; idx++ {
-			task, cErr := c.getTask(i)
-			if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
-				t.Fatalf("error: %v, pass: %d\n", cErr, i)
-			}
-			tasks = append(tasks, task)
-		}
-
-		// getting task before task finishes should return error
-		_, cErr := c.getTask(i)
-		if cErr == nil {
-			t.Fatalf("Should get error, pass: %d\n", i)
-		}
-
-		cErr = c.taskFinished(tasks[0].Meta.ID)
-		if cErr != nil {
-			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
-		}
-		// call taskFailed once won't put the task to failed queue, just ensure
-		// the call
-		cErr = c.taskFailed(tasks[0].Meta)
-		if cErr != nil {
-			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
-		}
-
-		tasks = tasks[1:]
-		_, cErr = c.getTask(i)
-		if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
-			t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr)
-		}
-
-		for _, task := range tasks {
-			cErr = c.taskFinished(task.Meta.ID)
-			if cErr != nil {
-				t.Fatal(cErr)
-			}
-		}
-	}
-
-	for i := 0; i < 10; i++ {
-		// init pass data
-		c.StartGetRecords(i)
-		checkOnePass(i)
-	}
-}
diff --git a/go/master/client_test.go b/go/master/client_test.go
deleted file mode 100644
index 01ecad2de..000000000
--- a/go/master/client_test.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master_test
-
-import (
-	"fmt"
-	"net"
-	"net/http"
-	"net/rpc"
-	"os"
-	"runtime"
-	"strconv"
-	"strings"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
-)
-
-// tool function for testing output goroutine ids
-func goid() int {
-	var buf [64]byte
-	n := runtime.Stack(buf[:], false)
-	idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0]
-	id, err := strconv.Atoi(idField)
-	if err != nil {
-		panic(fmt.Sprintf("cannot get goroutine id: %v", err))
-	}
-	return id
-}
-
-func TestNextRecord(t *testing.T) {
-	const (
-		path  = "/tmp/master_client_TestFull"
-		total = 50
-	)
-	l, err := net.Listen("tcp", ":0")
-	if err != nil {
-		panic(err)
-	}
-
-	ss := strings.Split(l.Addr().String(), ":")
-	p, err := strconv.Atoi(ss[len(ss)-1])
-	if err != nil {
-		panic(err)
-	}
-	go func(l net.Listener) {
-		s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1)
-		if err != nil {
-			panic(err)
-		}
-
-		server := rpc.NewServer()
-		err = server.Register(s)
-		if err != nil {
-			panic(err)
-		}
-
-		mux := http.NewServeMux()
-		mux.Handle(rpc.DefaultRPCPath, server)
-		err = http.Serve(l, mux)
-		if err != nil {
-			panic(err)
-		}
-	}(l)
-
-	f, err := os.Create(path)
-	if err != nil {
-		panic(err)
-	}
-
-	w := recordio.NewWriter(f, 1, -1)
-	for i := 0; i < total; i++ {
-		_, err = w.Write([]byte{byte(i)})
-		if err != nil {
-			panic(err)
-		}
-	}
-
-	err = w.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	err = f.Close()
-	if err != nil {
-		panic(err)
-	}
-
-	// start several client to test task fetching
-	var wg sync.WaitGroup
-	for i := 0; i < 4; i++ {
-		wg.Add(1)
-		// test for multiple concurrent clients
-		go func() {
-			defer wg.Done()
-			// each go-routine needs a single client connection instance
-			c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1))
-			if e != nil {
-				t.Fatal(e)
-			}
-			e = c.SetDataset([]string{path})
-			if e != nil {
-				panic(e)
-			}
-
-			// test for n passes
-			for pass := 0; pass < 10; pass++ {
-				c.StartGetRecords(pass)
-
-				received := make(map[byte]bool)
-				taskid := 0
-				for {
-					r, e := c.NextRecord()
-					if e != nil {
-						// ErrorPassAfter will wait, else break for next pass
-						if e.Error() == master.ErrPassBefore.Error() ||
-							e.Error() == master.ErrNoMoreAvailable.Error() {
-							break
-						}
-						t.Fatal(pass, taskid, "Read error:", e)
-					}
-					if len(r) != 1 {
-						t.Fatal(pass, taskid, "Length should be 1.", r)
-					}
-					if received[r[0]] {
-						t.Fatal(pass, taskid, "Received duplicate.", received, r)
-					}
-					taskid++
-					received[r[0]] = true
-				}
-			}
-		}()
-	}
-	wg.Wait()
-}
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
deleted file mode 100644
index 36fe61127..000000000
--- a/go/master/etcd_client.go
+++ /dev/null
@@ -1,201 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"context"
-	"time"
-
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	// DefaultLockPath is the default etcd master lock path.
-	DefaultLockPath = "/master/lock"
-	// DefaultStatePath is the default etcd key for master state.
-	DefaultStatePath = "/master/state"
-	// DefaultAddrPath is the default etcd key for master address.
-	DefaultAddrPath = "/master/addr"
-)
-
-// EtcdClient is the etcd client that the master uses for fault
-// tolerance and service registry.
-type EtcdClient struct {
-	lockPath  string
-	statePath string
-	client    *clientv3.Client
-	lock      *concurrency.Mutex
-	sess      *concurrency.Session
-}
-
-// NewEtcdClient creates a new EtcdClient.
-func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
-	log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints})
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   endpoints,
-		DialTimeout: dialTimeout,
-	})
-	if err != nil {
-		return nil, err
-	}
-
-	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
-	if err != nil {
-		return nil, err
-	}
-
-	lock := concurrency.NewMutex(sess, lockPath)
-	// It's fine for the lock to get stuck, in this case we have
-	// multiple master servers running (only configured to have
-	// one master running, but split-brain problem may cause
-	// multiple master servers running), and the cluster management
-	// software will kill one of them.
-	log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath})
-	err = lock.Lock(context.TODO())
-	if err != nil {
-		return nil, err
-	}
-	log.Info("Successfully acquired lock at %s.", log.Ctx{"path": lockPath})
-
-	put := clientv3.OpPut(addrPath, addr)
-	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
-	if err != nil {
-		return nil, err
-	}
-
-	if !resp.Succeeded {
-		log.Crit("No longer owns the master lock. Exiting.")
-		panic("No longer owns the master lock. Exiting.")
-	}
-
-	e := &EtcdClient{
-		lockPath:  lockPath,
-		statePath: statePath,
-		client:    cli,
-		lock:      lock,
-		sess:      sess,
-	}
-
-	return e, nil
-}
-
-// Save saves the state into the etcd.
-func (e *EtcdClient) Save(state []byte) error {
-	ctx := context.TODO()
-	put := clientv3.OpPut(e.statePath, string(state))
-	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
-	if err != nil {
-		return err
-	}
-
-	if !resp.Succeeded {
-		log.Error("No longer owns the lock, trying to lock again")
-		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-		err := e.lock.Lock(ctx)
-		cancel()
-		if err != nil {
-			// We lost the master lock and can not acquire
-			// it back, it means some other master is
-			// already started. We don't want cluster
-			// management system to kill the master server
-			// who is holding the lock and running
-			// correctly. So the most feasible solution is
-			// to kill current master server. The current
-			// state is not saved, but the trainer's RPC
-			// call will fail, so the trainer will retry.
-			log.Crit("Could not acquire the lock at %s: %v. Exiting.", log.Ctx{"path": e.lockPath, "error": err})
-			panic("Could not acquire the lock at %s: %v. Exiting.")
-		}
-		log.Info("Successfully acquired lock at %s.", e.lockPath)
-		return e.Save(state)
-	}
-
-	return nil
-}
-
-// Load loads the state from etcd.
-func (e *EtcdClient) Load() ([]byte, error) {
-	ctx := context.TODO()
-	get := clientv3.OpGet(e.statePath)
-
-	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
-	if err != nil {
-		return nil, err
-	}
-
-	if !resp.Succeeded {
-		log.Error("No longer owns the lock, trying to lock and load again.")
-		err = e.lock.Lock(context.Background())
-		if err != nil {
-			return nil, err
-		}
-
-		return e.Load()
-	}
-
-	kvs := resp.Responses[0].GetResponseRange().Kvs
-	if len(kvs) == 0 {
-		// No state exists
-		return nil, nil
-	}
-
-	state := kvs[0].Value
-	return state, nil
-}
-
-// Shutdown shuts down the etcd client gracefully.
-func (e *EtcdClient) Shutdown() error {
-	err := e.sess.Close()
-	newErr := e.client.Close()
-	if newErr != nil {
-		if err == nil {
-			err = newErr
-		} else {
-			log.Error("shutdown error", log.Ctx{"error": newErr})
-		}
-	}
-
-	return err
-}
-
-// GetKey gets the value by the specify key.
-func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	resp, err := c.Get(ctx, key)
-	cancel()
-	if err != nil {
-		return "", err
-	}
-	kvs := resp.Kvs
-	if len(kvs) == 0 {
-		return "", nil
-	}
-	v := kvs[0].Value
-	return string(v), nil
-}
-
-// watchKey watches the specify key and send to valChan if there is some event.
-func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
-	rch := c.Watch(context.Background(), key)
-	for wresp := range rch {
-		for _, ev := range wresp.Events {
-			// if received event is DELETE, the value will be an empty string
-			log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value})
-			valChan <- string(ev.Kv.Value)
-		}
-	}
-}
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
deleted file mode 100644
index 33b471431..000000000
--- a/go/master/inmem_store.go
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import "sync"
-
-// InMemStore is an in memory implementation of Store interface.
-//
-// It does not tolerate the fault that causes the program to crash.
-type InMemStore struct {
-	mu  sync.Mutex
-	buf []byte
-}
-
-// Save saves the state into the in-memory store.
-func (m *InMemStore) Save(state []byte) error {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	m.buf = state
-	return nil
-}
-
-// Load loads the state from the in-memory store.
-func (m *InMemStore) Load() ([]byte, error) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	return m.buf, nil
-}
-
-// Shutdown shuts down the in mem store.
-func (m *InMemStore) Shutdown() error {
-	return nil
-}
diff --git a/go/master/service.go b/go/master/service.go
deleted file mode 100644
index 39f746e52..000000000
--- a/go/master/service.go
+++ /dev/null
@@ -1,510 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import (
-	"bytes"
-	"compress/gzip"
-	"encoding/gob"
-	"errors"
-	"math/rand"
-	"os"
-	"path/filepath"
-	"sync"
-	"time"
-
-	log "github.com/inconshreveable/log15"
-
-	"github.com/PaddlePaddle/recordio"
-)
-
-const (
-	dialTimeout = 5 * time.Second
-)
-
-// ErrAllTaskFailed occur when tasks are in done or failed state.
-var ErrAllTaskFailed = errors.New("all task finished")
-
-// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail.
-var ErrNoMoreAvailable = errors.New("no more available task")
-
-// ErrPassBefore client side pass number does not match with master counter.
-var ErrPassBefore = errors.New("pass number smaller than master")
-
-// ErrPassAfter client side pass number does not match with master counter.
-var ErrPassAfter = errors.New("pass number larger than master")
-
-// Store is the interface for save and load the master state.
-type Store interface {
-	Save([]byte) error
-	Load() ([]byte, error)
-	Shutdown() error
-}
-
-// Chunk is a chunk of data consisted of several data instances.
-type Chunk struct {
-	Path  string
-	Index recordio.Index // chunk index
-}
-
-// TaskMeta is a struct which stores task's meta info.
-type TaskMeta struct {
-	ID    int
-	Epoch int
-}
-
-// Task is the basic unit of data instances assigned to trainers.
-type Task struct {
-	Meta   TaskMeta
-	Chunks []Chunk
-}
-
-type taskEntry struct {
-	Task Task
-	// A task fails if it's timeout or trainer reports it exits unnormally.
-	NumFailure int
-}
-
-type masterState struct {
-	Todo    []taskEntry
-	Pending map[int]taskEntry // map from task ID to task entry
-	Done    []taskEntry
-	Failed  []taskEntry
-	CurPass int
-}
-
-// Service is the master server service.
-type Service struct {
-	chunksPerTask int
-	timeoutDur    time.Duration
-	failureMax    int
-	store         Store
-
-	ready    chan struct{}
-	initDone bool
-
-	mu sync.Mutex
-	// State to be persisted to snapshot.
-	state masterState
-	// The trainer that is currently saving model. This state is
-	// transient, does not need to be persisted to snapshot.
-	savingTrainer string
-}
-
-func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	// generate uniq id across job using nanosecond + randint + counter
-	// FIXME(typhoonzero): this is a workaround, use uuid
-	randStart := rand.Int()
-	counter := 0
-	timestamp := time.Now().Nanosecond()
-	id := timestamp + randStart + counter
-	if chunksPerTask <= 0 {
-		chunksPerTask = 1
-	}
-
-	var result []taskEntry
-	var cur taskEntry
-	for i, c := range chunks {
-		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.Meta.ID = id
-			counter++
-			id = timestamp + randStart + counter
-			result = append(result, cur)
-			cur.Task.Chunks = nil
-		}
-
-		cur.Task.Chunks = append(cur.Task.Chunks, c)
-	}
-
-	if len(cur.Task.Chunks) > 0 {
-		cur.Task.Meta.ID = id
-		result = append(result, cur)
-	}
-
-	return result
-}
-
-// NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
-	s := &Service{}
-	s.chunksPerTask = chunksPerTask
-	s.timeoutDur = timeoutDur
-	s.failureMax = failureMax
-	s.state = masterState{}
-	s.state.Pending = make(map[int]taskEntry)
-	s.ready = make(chan struct{})
-	s.store = store
-	recovered, err := s.recover()
-	if err != nil {
-		return nil, err
-	}
-
-	if recovered {
-		// Recovered. Now the state is already initialized,
-		// and the master is ready.
-		s.initDone = true
-		close(s.ready)
-		log.Info("Master recovered from saved state.")
-	}
-
-	return s, nil
-}
-
-// recover recovers service state from etcd.
-func (s *Service) recover() (bool, error) {
-	state, err := s.store.Load()
-	if err != nil {
-		return false, err
-	}
-
-	if state == nil {
-		log.Info("No state exists, not recovered.")
-		return false, nil
-	}
-
-	log.Info("Loaded snapshot.", log.Ctx{"size": len(state)})
-	gr, err := gzip.NewReader(bytes.NewReader(state))
-	if err != nil {
-		return false, err
-	}
-
-	dec := gob.NewDecoder(gr)
-	var tqs masterState
-	err = dec.Decode(&tqs)
-	if err != nil {
-		return false, err
-	}
-
-	err = gr.Close()
-	if err != nil {
-		// Only close failed, recover actually succeed, so
-		// just log error.
-		log.Error("error close recover file.", log.Ctx{"error": err})
-	}
-
-	s.state = tqs
-	log.Info("Master recovered from snapshot, scheduling pending task timeout check.", s.logCtx())
-	for _, t := range s.state.Pending {
-		time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
-	}
-
-	return true, nil
-}
-
-// snapshot *must* be called with s.mu being held.
-func (s *Service) snapshot() error {
-	// TODO(helin): etcd request has a size limit, so the snapshot
-	// size is limited by the max request size. We should either
-	// divide the snapshot into smaller chunks and save under
-	// different keys, or configure the request size to be big
-	// enough:
-	// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
-	var buf bytes.Buffer
-	gw := gzip.NewWriter(&buf)
-	enc := gob.NewEncoder(gw)
-	err := enc.Encode(s.state)
-	if err != nil {
-		return err
-	}
-	err = gw.Close()
-	if err != nil {
-		return err
-	}
-
-	state := buf.Bytes()
-	log.Info("Saving snapshot.", log.Ctx{"size bytes": len(state)})
-	return s.store.Save(state)
-}
-
-func readChunks(globPaths []string) ([]Chunk, error) {
-	var chunks []Chunk
-	var paths []string
-
-	for _, s := range globPaths {
-		match, err := filepath.Glob(s)
-		if err != nil {
-			return nil, err
-		}
-		paths = append(paths, match...)
-	}
-
-	if len(paths) == 0 {
-		return nil, errors.New("no valid dataset specified")
-	}
-
-	for _, path := range paths {
-		f, err := os.Open(path)
-		if err != nil {
-			return nil, err
-		}
-
-		index, err := recordio.LoadIndex(f)
-		if err != nil {
-			return nil, err
-		}
-		err = f.Close()
-		if err != nil {
-			return nil, err
-		}
-
-		count := index.NumChunks()
-		log.Info("reading chunks.", log.Ctx{"path": path, "num chunks": count})
-		for i := 0; i < count; i++ {
-			chunk := Chunk{
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
-		}
-	}
-
-	return chunks, nil
-}
-
-// SetDataset sets dataset to dispatch for the master server.
-//
-// SetDataset can be call multiple times. But only the first call will
-// be honored.
-func (s *Service) SetDataset(globPaths []string, _ *int) error {
-	if len(globPaths) == 0 {
-		return errors.New("no dataset specified")
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	if s.initDone {
-		// Already initialized. All trainer will call
-		// SetDataset, but we only handle the first one. Treat
-		// other calls as successful but do nothing.
-		return nil
-	}
-
-	chunks, err := readChunks(globPaths)
-	if err != nil {
-		return err
-	}
-
-	s.state.Todo = partition(chunks, s.chunksPerTask)
-
-	err = s.snapshot()
-	if err != nil {
-		log.Error("snapshot error", log.Ctx{"error": err})
-		return err
-	}
-	close(s.ready)
-	s.initDone = true
-	return nil
-}
-
-// processFailedTask retry s.failureMax times for failed task.
-// return true if all task are done or failed.
-func (s *Service) processFailedTask(t taskEntry, epoch int) {
-	if t.Task.Meta.Epoch != epoch {
-		// new epoch, task launched after the
-		// schedule of this timeout check or failed status report.
-		return
-	}
-
-	defer func() {
-		err := s.snapshot()
-		if err != nil {
-			log.Error("snapshot error", log.Ctx{"error": err})
-		}
-	}()
-
-	delete(s.state.Pending, t.Task.Meta.ID)
-
-	t.NumFailure++
-	if t.NumFailure > s.failureMax {
-		log.Warn("Task failed to many times, discard.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
-		s.state.Failed = append(s.state.Failed, t)
-		return
-	}
-
-	log.Warn("Task failed, re-dispatch.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
-	s.state.Todo = append(s.state.Todo, t)
-	return
-}
-
-func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
-	return func() {
-		s.mu.Lock()
-		defer s.mu.Unlock()
-
-		t, ok := s.state.Pending[taskID]
-		if !ok {
-			return
-		}
-
-		s.processFailedTask(t, epoch)
-	}
-}
-
-// must be called with lock held.
-func (s *Service) logCtx() log.Ctx {
-	return log.Ctx{
-		"todoLen":    len(s.state.Todo),
-		"pendingLen": len(s.state.Pending),
-		"doneLen":    len(s.state.Done),
-		"failedLen":  len(s.state.Failed),
-		"curPass":    s.state.CurPass,
-	}
-}
-
-// GetTask gets a new task from the service.
-// passID is the client side pass count
-func (s *Service) GetTask(passID int, task *Task) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	if passID < s.state.CurPass {
-		return ErrPassBefore
-	}
-	if passID > s.state.CurPass {
-		// Client may get run to pass after master when one client faster than the
-		// other
-		return ErrPassAfter
-	}
-
-	if len(s.state.Todo) == 0 {
-		if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
-			log.Warn("All tasks failed, may start next pass", s.logCtx())
-			return ErrAllTaskFailed
-		}
-		log.Warn("No more available task.", s.logCtx())
-		return ErrNoMoreAvailable
-	}
-
-	t := s.state.Todo[0]
-	t.Task.Meta.Epoch++
-	s.state.Todo = s.state.Todo[1:]
-	s.state.Pending[t.Task.Meta.ID] = t
-	err := s.snapshot()
-	if err != nil {
-		return err
-	}
-
-	*task = t.Task
-	ctx := s.logCtx()
-	ctx["task meta"] = t.Task.Meta
-	log.Info("Task dispatched.", ctx)
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
-	return nil
-}
-
-// TaskFinished tell the service that a task is finished.
-func (s *Service) TaskFinished(taskID int, dummy *int) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	t, ok := s.state.Pending[taskID]
-	if !ok {
-		ctx := s.logCtx()
-		ctx["task id"] = taskID
-		log.Warn("Pending task not found.", ctx)
-		return nil
-	}
-
-	// task finished, reset timeout
-	t.NumFailure = 0
-	s.state.Done = append(s.state.Done, t)
-	delete(s.state.Pending, taskID)
-
-	ctx := s.logCtx()
-	ctx["task id"] = taskID
-	log.Info("Task finished.", ctx)
-	if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
-		// increase master side pass count if all tasks finished
-		s.state.CurPass++
-		s.state.Todo = append(s.state.Done, s.state.Failed...)
-		s.state.Done = []taskEntry{}
-		// TODO(typhoonzero): deal with failed tasks
-		s.state.Failed = []taskEntry{}
-		ctx := s.logCtx()
-		ctx["new pass"] = s.state.CurPass
-		log.Warn("all task finished, add new pass data.", ctx)
-	}
-
-	err := s.snapshot()
-	if err != nil {
-		log.Error("snapshot error", log.Ctx{"error": err})
-	}
-	return err
-}
-
-// TaskFailed tells the service that a task is failed.
-func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
-	select {
-	case <-s.ready:
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	t, ok := s.state.Pending[meta.ID]
-	if !ok {
-		log.Warn("TaskFailed:Pending task not found.", log.Ctx{"task": t.Task.Meta})
-		return nil
-	}
-
-	s.processFailedTask(t, meta.Epoch)
-	return nil
-}
-
-// SaveModelRequest is the request for saving model
-type SaveModelRequest struct {
-	TrainerID string
-	BlockDur  time.Duration
-}
-
-// RequestSaveModel requests the master server to approve the caller
-// to save the model.
-func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	if req.TrainerID == "" {
-		return errors.New("trainer id is empty")
-	}
-
-	if s.savingTrainer == "" {
-		*need = true
-	} else {
-		if req.TrainerID == s.savingTrainer {
-			// save trainer asked to save model again
-			*need = true
-		} else {
-			*need = false
-		}
-	}
-
-	if *need {
-		s.savingTrainer = req.TrainerID
-		time.AfterFunc(req.BlockDur, func() {
-			s.mu.Lock()
-			s.savingTrainer = ""
-			s.mu.Unlock()
-		})
-	}
-
-	return nil
-}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
deleted file mode 100644
index dd22f3d54..000000000
--- a/go/master/service_internal_test.go
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package master
-
-import "testing"
-
-func TestPartitionCount(t *testing.T) {
-	cs := make([]Chunk, 100)
-	ts := partition(cs, 5)
-	if len(ts) != 20 {
-		t.Error(len(ts))
-	}
-
-	cs = make([]Chunk, 101)
-	ts = partition(cs, 5)
-	if len(ts) != 21 {
-		t.Error(len(ts))
-	}
-
-	ts = partition(cs, 1)
-	if len(ts) != 101 {
-		t.Error(len(ts))
-	}
-
-	ts = partition(cs, 0)
-	if len(ts) != 101 {
-		t.Error(len(ts))
-	}
-}
-
-func TestPartionIndex(t *testing.T) {
-	cs := make([]Chunk, 100)
-	ts := partition(cs, 20)
-	for i := range ts {
-		// test auto increament ids
-		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
-			t.Error(ts[i], i)
-		}
-	}
-}
diff --git a/go/master/service_test.go b/go/master/service_test.go
deleted file mode 100644
index 2d00c22d6..000000000
--- a/go/master/service_test.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package master_test
-
-import (
-	"io/ioutil"
-	"net/url"
-	"os"
-	"strings"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/embed"
-	"github.com/stretchr/testify/assert"
-)
-
-func TestNewServiceWithEtcd(t *testing.T) {
-	// setup an embed etcd server
-	etcdDir, err := ioutil.TempDir("", "")
-	if err != nil {
-		t.Fatal(err)
-	}
-	cfg := embed.NewConfig()
-	lpurl, _ := url.Parse("http://localhost:0")
-	lcurl, _ := url.Parse("http://localhost:0")
-	cfg.LPUrls = []url.URL{*lpurl}
-	cfg.LCUrls = []url.URL{*lcurl}
-	cfg.Dir = etcdDir
-	e, err := embed.StartEtcd(cfg)
-	if err != nil {
-		t.Fatal(err)
-	}
-	defer func() {
-		e.Close()
-		if err := os.RemoveAll(etcdDir); err != nil {
-			t.Fatal(err)
-		}
-	}()
-
-	<-e.Server.ReadyNotify()
-
-	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
-	endpoint := "127.0.0.1:" + port
-
-	ep := []string{endpoint}
-	masterAddr := "127.0.0.1:3306"
-	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	_, err = master.NewService(store, 10, 10, 3)
-	if err != nil {
-		t.Fatal(err)
-	}
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   ep,
-		DialTimeout: 3 * time.Second,
-	})
-	if err != nil {
-		t.Fatal(err)
-	}
-	v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if err := cli.Close(); err != nil {
-		t.Fatal(err)
-	}
-	// test master process registry itself into etcd server.
-	assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.")
-}
diff --git a/go/proto/.gitignore b/go/proto/.gitignore
deleted file mode 100644
index 5e7d2734c..000000000
--- a/go/proto/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# Ignore everything in this directory
-*
-# Except this file
-!.gitignore
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
deleted file mode 100644
index 32f3b2bab..000000000
--- a/go/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(pserver_test DEPS paddle_go_optimizer gen_proto_go)
-endif()
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
deleted file mode 100644
index 1d6f45a66..000000000
--- a/go/pserver/client/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(pserver_client_test DEPS paddle_go_optimizer)
-endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
deleted file mode 100644
index 4bf05c853..000000000
--- a/go/pserver/client/c/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
deleted file mode 100644
index 78776219d..000000000
--- a/go/pserver/client/c/CMakeLists.txt
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
-target_link_libraries(paddle_go_optimizer stdc++ m)
-
-# Copy library to the required place.
-# See: go/pserver/optimizer.go:
-# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
-add_custom_command(TARGET paddle_go_optimizer POST_BUILD
-  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
-  )
-
-go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
-if(WITH_TESTING)
-  # FIXME: this test requires pserver which is not managed by the test
-  # we need some kind of e2e testing machanism.
-  # add_subdirectory(test)
-endif()
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
deleted file mode 100644
index cddc28e46..000000000
--- a/go/pserver/client/c/cclient.go
+++ /dev/null
@@ -1,300 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package main
-
-/*
-#include <string.h>
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32   = 0,
-  PADDLE_ELEMENT_TYPE_UINT32  = 1,
-  PADDLE_ELEMENT_TYPE_INT64   = 2,
-  PADDLE_ELEMENT_TYPE_UINT64  = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-typedef struct {
-  char*               name;
-  paddle_element_type element_type;
-  unsigned char*      content;
-  int                 content_len;
-} paddle_parameter, paddle_gradient;
-
-typedef int paddle_pserver_client;
-#define PSERVER_ERROR -1
-#define PSERVER_OK 0
-*/
-import "C"
-
-import (
-	"strings"
-	"sync"
-	"unsafe"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	log "github.com/inconshreveable/log15"
-)
-
-func init() {
-	log.Root().SetHandler(
-		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
-	)
-}
-
-var mu sync.Mutex
-var handleMap = make(map[C.paddle_pserver_client]*client.Client)
-var curHandle C.paddle_pserver_client
-
-func add(c *client.Client) C.paddle_pserver_client {
-	mu.Lock()
-	defer mu.Unlock()
-	cli := curHandle
-	curHandle++
-	handleMap[cli] = c
-	return cli
-}
-
-func get(client C.paddle_pserver_client) *client.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	return handleMap[client]
-}
-
-func remove(client C.paddle_pserver_client) *client.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	h := handleMap[client]
-	delete(handleMap, client)
-	return h
-}
-
-func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nil {
-		return nil
-	}
-
-	// create a Go clice backed by a C array, reference:
-	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	//
-	// Go garbage collector will not interact with this data, need
-	// to be freed properly.
-	return (*[1 << 30]byte)(p)[:len:len]
-}
-
-type selector bool
-
-func (s selector) Select() (bool, error) {
-	return bool(s), nil
-}
-
-func (s selector) Done() error {
-	return nil
-}
-
-type lister []client.Server
-
-func (l lister) List() []client.Server {
-	return l
-}
-
-//export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
-	a := C.GoString(addrs)
-	as := strings.Split(a, ",")
-	servers := make([]client.Server, len(as))
-	for i := range as {
-		servers[i].Index = i
-		servers[i].Addr = as[i]
-	}
-	c := client.NewClient(lister(servers), len(as), selector(selected != 0))
-	return add(c)
-}
-
-//export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
-	addr := C.GoString(etcdEndpoints)
-	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
-	return add(c)
-}
-
-//export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.paddle_pserver_client) {
-	remove(client)
-}
-
-// paddle_begin_init_params tells trainer if it needs to init the
-// parameters.
-//
-// returns 1 if the trainer needs to init the parameters. 0 if the
-// trainer does not need to init the parameters.
-//
-//export paddle_begin_init_params
-func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
-	c := get(client)
-	selected, err := c.BeginInitParams()
-	if err != nil {
-		panic(err)
-	}
-
-	if selected {
-		return 1
-	}
-	return 0
-}
-
-//export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
-	et := pserver.ElementType(param.element_type)
-	name := C.GoString(param.name)
-	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
-	pc := pserver.ParameterWithConfig{
-		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(paramConfig, int(configLen)),
-	}
-	c := get(client)
-	err := c.InitParam(pc)
-
-	if err != nil {
-		if err.Error() == pserver.AlreadyInitialized {
-			log.Warn(
-				"parameter already initialized, treat paddle_init_param as successful.",
-				log.Ctx{"parameter": name},
-			)
-			return C.PSERVER_OK
-		}
-		log.Error("error init param", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_finish_init_params
-func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
-	c := get(client)
-	err := c.FinishInitParams()
-	if err != nil {
-		if err.Error() == pserver.AlreadyInitialized {
-			log.Warn("parameters already initialized, treat paddle_finish_init_params as successful.")
-			return C.PSERVER_OK
-		}
-
-		log.Error("error finish init params", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_send_grads
-func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
-	var gs []pserver.Gradient
-	for i := 0; i < int(total); i++ {
-		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
-		et := pserver.ElementType(grad.element_type)
-		name := C.GoString(grad.name)
-		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
-		gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content})
-	}
-
-	c := get(client)
-	err := c.SendGrads(gs)
-	if err != nil {
-		log.Error("error send grads", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
-//export paddle_get_params
-func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
-	var ns []string
-	for i := 0; i < int(total); i++ {
-		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		ns = append(ns, C.GoString(param.name))
-	}
-	c := get(client)
-	ps, err := c.GetParams(ns)
-	if err != nil {
-		log.Error("error get params", log.Ctx{"error": err})
-		return C.PSERVER_ERROR
-	}
-
-	if len(ps) != len(ns) {
-		pn := make([]string, len(ps))
-		for i, p := range ps {
-			pn[i] = p.Name
-		}
-		log.Error(
-			"pserver returned wrong number of parameters.",
-			log.Ctx{
-				"Requested": strings.Join(pn, ", "),
-				"Returned":  strings.Join(ns, ", "),
-			},
-		)
-		return C.PSERVER_ERROR
-	}
-
-	for i := range ps {
-		if ns[i] != ps[i].Name {
-			pn := make([]string, len(ps))
-			for i, p := range ps {
-				pn[i] = p.Name
-			}
-			log.Error(
-				"pserver returned wrong parameters, or not in requested order.",
-				log.Ctx{
-					"Requested": strings.Join(pn, ", "),
-					"Returned":  strings.Join(ns, ", "),
-				},
-			)
-			return C.PSERVER_ERROR
-		}
-	}
-
-	for i := 0; i < int(total); i++ {
-		p := ps[i]
-		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-
-		if unsafe.Pointer(param) == nil {
-			log.Error("must pre-allocate parameter.")
-			return C.PSERVER_ERROR
-		}
-
-		if unsafe.Pointer(param.content) != nil {
-			if int(param.content_len) != len(p.Content) {
-				log.Error(
-					"the pre-allocated content len does not match parameter content len.",
-					log.Ctx{
-						"Pre-allocated len": param.content_len,
-						"Returned len":      len(p.Content),
-					},
-				)
-				return C.PSERVER_ERROR
-			}
-		}
-
-		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
-		param.content_len = C.int(len(p.Content))
-		param.element_type = C.paddle_element_type(p.ElementType)
-	}
-
-	return C.PSERVER_OK
-}
-
-func main() {} // Required but ignored
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
deleted file mode 100644
index 4500b1f28..000000000
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c
deleted file mode 100644
index 0116e42a0..000000000
--- a/go/pserver/client/c/test/test_cclient.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "libpaddle_pserver_cclient.h"
-
-// TODO(helin): Fix: gtest using cmake is not working, using this
-// hacky way for now.
-#define fail()                                          \
-  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
-  exit(-1);
-
-void sendGrads(paddle_pserver_client c) {
-  unsigned char grad_a[2000] = {2};
-  unsigned char grad_b[3000] = {3};
-  paddle_gradient grad1 = {
-      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
-  paddle_gradient grad2 = {
-      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
-  paddle_gradient *grads[2] = {&grad1, &grad2};
-  if (paddle_send_grads(c, grads, 2)) {
-    fail();
-  }
-}
-
-void getParams(paddle_pserver_client c) {
-  paddle_parameter param_a;
-  paddle_parameter param_b;
-  char name_a[] = "param_a";
-  char name_b[] = "param_b";
-  // Must pre-allocate the prameter content before calling paddle_get_params.
-  unsigned char content_a[2000] = {};
-  unsigned char content_b[3000] = {};
-  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-  param_a.name = name_a;
-  param_a.content = content_a;
-  param_a.content_len = 2000;
-  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-  param_b.name = name_b;
-  param_b.content = content_b;
-  param_b.content_len = 3000;
-
-  paddle_parameter *params[2] = {&param_a, &param_b};
-  if (paddle_get_params(c, params, 2)) {
-    fail();
-  }
-}
-
-int main() {
-  char addr[] = "localhost:3000";
-  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
-  char *config_proto;
-  size_t config_proto_len = 0;
-  ssize_t nread;
-  FILE *fp = fopen("testdata/optimizer.pb", "r");
-  if (!fp) {
-    fail();
-  }
-  while ((nread = getline(&config_proto, &config_proto_len, fp)) != -1) {
-    printf("%s", config_proto);
-  }
-  fclose(fp);
-retry:
-  if (paddle_begin_init_params(c)) {
-    paddle_parameter param;
-    char name_a[] = "param_a";
-    char name_b[] = "param_b";
-    unsigned char content_a[2000] = {1};
-    unsigned char content_b[3000] = {0};
-    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-    param.name = name_a;
-    param.content = content_a;
-    param.content_len = 2000;
-    int error =
-        paddle_init_param(c, param, (void *)config_proto, config_proto_len);
-    if (error != 0) {
-      goto retry;
-    }
-
-    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-    param.name = name_b;
-    param.content = content_b;
-    param.content_len = 3000;
-    error = paddle_init_param(c, param, (void *)config_proto, config_proto_len);
-    if (error != 0) {
-      goto retry;
-    }
-
-    error = paddle_finish_init_params(c);
-    if (error != 0) {
-      goto retry;
-    }
-  }
-
-  int i;
-  for (i = 0; i < 100; i++) {
-    sendGrads(c);
-    getParams(c);
-  }
-
-  return 0;
-}
diff --git a/go/pserver/client/c/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py
deleted file mode 100644
index 97f63aeb6..000000000
--- a/go/pserver/client/c/test/test_mnist.py
+++ /dev/null
@@ -1,145 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-import gzip
-
-
-def softmax_regression(img):
-    predict = paddle.layer.fc(input=img,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def multilayer_perceptron(img):
-    # The first fully-connected layer
-    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
-    # The second fully-connected layer and the according activation function
-    hidden2 = paddle.layer.fc(input=hidden1,
-                              size=64,
-                              act=paddle.activation.Relu())
-    # The thrid fully-connected layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=hidden2,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def convolutional_neural_network(img):
-    # first conv layer
-    conv_pool_1 = paddle.networks.simple_img_conv_pool(
-        input=img,
-        filter_size=5,
-        num_filters=20,
-        num_channel=1,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # second conv layer
-    conv_pool_2 = paddle.networks.simple_img_conv_pool(
-        input=conv_pool_1,
-        filter_size=5,
-        num_filters=50,
-        num_channel=20,
-        pool_size=2,
-        pool_stride=2,
-        act=paddle.activation.Tanh())
-    # The first fully-connected layer
-    fc1 = paddle.layer.fc(input=conv_pool_2,
-                          size=128,
-                          act=paddle.activation.Tanh())
-    # The softmax layer, note that the hidden size should be 10,
-    # which is the number of unique digits
-    predict = paddle.layer.fc(input=fc1,
-                              size=10,
-                              act=paddle.activation.Softmax())
-    return predict
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # define network topology
-    images = paddle.layer.data(
-        name='pixel', type=paddle.data_type.dense_vector(784))
-    label = paddle.layer.data(
-        name='label', type=paddle.data_type.integer_value(10))
-
-    # Here we can build the prediction network in different ways. Please
-    # choose one by uncomment corresponding line.
-    predict = softmax_regression(images)
-    #predict = multilayer_perceptron(images)
-    #predict = convolutional_neural_network(images)
-
-    cost = paddle.layer.classification_cost(input=predict, label=label)
-    parameters = paddle.parameters.create(cost)
-
-    optimizer = paddle.optimizer.Momentum(
-        learning_rate=0.1 / 128.0,
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 is_local=False,
-                                 pserver_spec="localhost:3000")
-
-    lists = []
-
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1000 == 0:
-                print "Pass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
-
-        elif isinstance(event, paddle.event.EndPass):
-            result = trainer.test(reader=paddle.batch(
-                paddle.dataset.mnist.test(), batch_size=128))
-            print "Test with Pass %d, Cost %f, %s\n" % (
-                event.pass_id, result.cost, result.metrics)
-            lists.append((event.pass_id, result.cost,
-                          result.metrics['classification_error_evaluator']))
-
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.mnist.train(), buf_size=8192),
-            batch_size=128),
-        event_handler=event_handler,
-        num_passes=100)
-
-    # find the best pass
-    best = sorted(lists, key=lambda list: float(list[1]))[0]
-    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
-    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
-
-    test_creator = paddle.dataset.mnist.test()
-    test_data = []
-    for item in test_creator():
-        test_data.append((item[0], ))
-        if len(test_data) == 100:
-            break
-
-    # output is a softmax layer. It returns probabilities.
-    # Shape should be (100, 10)
-    probs = paddle.infer(
-        output_layer=predict, parameters=parameters, input=test_data)
-    print probs.shape
-
-
-if __name__ == '__main__':
-    main()
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
deleted file mode 100644
index 2db5a0bf6..000000000
--- a/go/pserver/client/c/test/test_train.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-import paddle.v2.dataset.uci_housing as uci_housing
-import paddle.v2.master as master
-import os
-import cPickle as pickle
-from paddle.v2.reader.creator import cloud_reader
-
-etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
-etcd_endpoints = "http://" + etcd_ip + ":2379"
-print "etcd endpoints: ", etcd_endpoints
-
-
-def main():
-    # init
-    paddle.init(use_gpu=False, trainer_count=1)
-
-    # network config
-    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
-    y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(name='w'),
-                                size=1,
-                                act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(name='b'))
-    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-    cost = paddle.layer.mse_cost(input=y_predict, label=y)
-
-    # create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # create optimizer of new remote updater to pserver
-    optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3)
-
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 is_local=False,
-                                 pserver_spec=etcd_endpoints,
-                                 use_etcd=True)
-
-    # event_handler to print training and testing info
-    def event_handler(event):
-        if isinstance(event, paddle.event.EndIteration):
-            # FIXME: for cloud data reader, pass number is managed by master
-            # should print the server side pass number
-            if event.batch_id % 100 == 0:
-                print "Pass %d, Batch %d, Cost %f" % (
-                    event.pass_id, event.batch_id, event.cost)
-
-        if isinstance(event, paddle.event.EndPass):
-            if (event.pass_id + 1) % 10 == 0:
-                result = trainer.test(
-                    reader=paddle.batch(
-                        uci_housing.test(), batch_size=2),
-                    feeding={'x': 0,
-                             'y': 1})
-                print "Test %d, %.2f" % (event.pass_id, result.cost)
-
-    # training
-    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
-    trainer.train(
-        reader=paddle.batch(
-            paddle.reader.shuffle(
-                cloud_reader(
-                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
-                    etcd_endpoints),
-                buf_size=500),
-            batch_size=2),
-        feeding={'x': 0,
-                 'y': 1},
-        event_handler=event_handler,
-        num_passes=30)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/go/pserver/client/c/test/testdata/optimizer.pb b/go/pserver/client/c/test/testdata/optimizer.pb
deleted file mode 100644
index 27dd3bc5f19e2964b4b674cff8860233cbdb445a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 50
kcmd;JloDUb$N&X9;j9CU3=s@ToSd^}g1}Dum25B;0LStS`2YX_

diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go
deleted file mode 100644
index 2a8f66a07..000000000
--- a/go/pserver/client/client.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client
-
-import (
-	"errors"
-	"hash/fnv"
-	"sort"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/connection"
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/inconshreveable/log15"
-)
-
-// TODO(helin): add RPC call retry logic
-
-// Selector selects if the client should initialize parameters and
-// reports the initialization process done.
-type Selector interface {
-	// Select selects if the client should initialize parameter servers.
-	Select() (bool, error)
-	// Done indicates the initialization process is done.
-	Done() error
-}
-
-// Server is the identification of a parameter Server.
-type Server struct {
-	Index int
-	Addr  string
-}
-
-// Lister lists currently available parameter servers.
-type Lister interface {
-	List() []Server
-}
-
-// Client is the client to parameter servers.
-type Client struct {
-	sel      Selector
-	pservers []*connection.Conn
-}
-
-// NewClient creates a new client.
-func NewClient(l Lister, pserverNum int, sel Selector) *Client {
-	c := &Client{sel: sel}
-	c.pservers = make([]*connection.Conn, pserverNum)
-	for i := 0; i < pserverNum; i++ {
-		c.pservers[i] = connection.New()
-	}
-	go c.monitorPservers(l, pserverNum)
-	return c
-}
-
-// monitorPservers monitors pserver addresses, and updates connection
-// when the address changes.
-func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	lastServers := make([]Server, pserverNum)
-	ticker := time.NewTicker(10 * time.Second)
-	monitor := func() {
-		curServers := make([]Server, pserverNum)
-		list := l.List()
-		for _, l := range list {
-			curServers[l.Index] = l
-		}
-
-		for i := range lastServers {
-			if lastServers[i].Addr == curServers[i].Addr {
-				continue
-			}
-
-			if curServers[i].Addr == "" {
-				err := c.pservers[i].Close()
-				if err != nil {
-					log.Error("error closing connection to pserver", log.Ctx{"error": err})
-				}
-
-				continue
-			}
-
-			err := c.pservers[i].Connect(curServers[i].Addr)
-			if err != nil {
-				log.Error("error connecting to pserver", log.Ctx{"error": err})
-
-				// connect to addr failed, set
-				// to last known addr in order
-				// to retry next time.
-				curServers[i].Addr = lastServers[i].Addr
-			}
-
-		}
-
-		lastServers = curServers
-	}
-
-	monitor()
-	for range ticker.C {
-		monitor()
-	}
-}
-
-// BeginInitParams begins to initialize parameters on parameter
-// servers.
-//
-// BeginInitParams will be called from multiple trainers, only one
-// trainer will be selected to initialize the parameters on parameter
-// servers. Other trainers will be blocked until the initialization is
-// done, and they need to get the initialized parameters from
-// parameter servers using GetParams.
-func (c *Client) BeginInitParams() (bool, error) {
-	return c.sel.Select()
-}
-
-// InitParam initializes the parameter on parameter servers.
-func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error {
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
-}
-
-// FinishInitParams tells parameter servers client has sent all
-// parameters to parameter servers as initialization.
-func (c *Client) FinishInitParams() error {
-	for _, p := range c.pservers {
-		err := p.Call("Service.FinishInitParams", 0, nil)
-		if err != nil {
-			return err
-		}
-	}
-	return c.sel.Done()
-}
-
-// SendGrads sends gradients to parameter servers for updating
-// parameters.
-func (c *Client) SendGrads(grads []pserver.Gradient) error {
-	if len(grads) == 0 {
-		return errors.New("no gradient received")
-	}
-	errCh := make(chan error, len(grads))
-	for _, g := range grads {
-		go func(g pserver.Gradient) {
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
-			errCh <- err
-		}(g)
-	}
-
-	recv := 0
-	for err := range errCh {
-		if err != nil {
-			return err
-		}
-
-		recv++
-		if recv == len(grads) {
-			break
-		}
-	}
-	return nil
-}
-
-type result struct {
-	idx   int
-	param pserver.Parameter
-	err   error
-}
-
-type results []result
-
-func (r results) Len() int {
-	return len(r)
-}
-
-func (r results) Less(i int, j int) bool {
-	return r[i].idx < r[j].idx
-}
-
-func (r results) Swap(i int, j int) {
-	r[i], r[j] = r[j], r[i]
-}
-
-// GetParams gets parameters from parameter servers.
-func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
-	rCh := make(chan result, len(names))
-
-	for idx, name := range names {
-		go func(name string, idx int) {
-			var parameter pserver.Parameter
-			err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
-			rCh <- result{idx: idx, param: parameter, err: err}
-		}(name, idx)
-	}
-
-	var rs results
-	recv := 0
-	for r := range rCh {
-		if r.err != nil {
-			return nil, r.err
-		}
-		rs = append(rs, r)
-
-		recv++
-		if recv == len(names) {
-			break
-		}
-	}
-	sort.Sort(rs)
-
-	ps := make([]pserver.Parameter, len(rs))
-	for i := range rs {
-		ps[i] = rs[i].param
-	}
-
-	return ps, nil
-}
-
-func strHash(s string) uint32 {
-	h := fnv.New32a()
-	_, _ = h.Write([]byte(s))
-	return h.Sum32()
-}
-
-// TODO(helin): now partition only select which parameter server to
-// send the entire parameter. We need to partition a parameter into
-// small blocks and send to different parameter servers.
-func (c *Client) partition(key string) int {
-	return int(strHash(key) % uint32(len(c.pservers)))
-}
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
deleted file mode 100644
index 3a067ff51..000000000
--- a/go/pserver/client/client_test.go
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client_test
-
-import (
-	"context"
-	"io/ioutil"
-	"math/rand"
-	"net"
-	"net/http"
-	"net/rpc"
-	"strconv"
-	"strings"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	"github.com/coreos/etcd/clientv3"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	numPserver    = 10
-	etcdEndpoints = "127.0.0.1:2379"
-	timeout       = 2 * time.Second
-)
-
-var pserverClientPorts [numPserver]int
-
-// this function init pserver client and return their ports in an array.
-func initClient() [numPserver]int {
-	var ports [numPserver]int
-	for i := 0; i < numPserver; i++ {
-		l, err := net.Listen("tcp", ":0")
-		if err != nil {
-			panic(err)
-		}
-
-		ss := strings.Split(l.Addr().String(), ":")
-		p, err := strconv.Atoi(ss[len(ss)-1])
-		if err != nil {
-			panic(err)
-		}
-		ports[i] = p
-
-		go func(l net.Listener) {
-			var cp pserver.Checkpoint
-			s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-			if err != nil {
-				panic(err)
-			}
-			server := rpc.NewServer()
-			err = server.Register(s)
-			if err != nil {
-				panic(err)
-			}
-
-			mux := http.NewServeMux()
-			mux.Handle(rpc.DefaultRPCPath, server)
-			err = http.Serve(l, mux)
-			if err != nil {
-				panic(err)
-			}
-		}(l)
-	}
-	return ports
-}
-
-func initNativeClient() {
-	pserverClientPorts = initClient()
-}
-
-func initEtcdClient() {
-	client, err := clientv3.New(clientv3.Config{
-		Endpoints:   []string{etcdEndpoints},
-		DialTimeout: time.Second * time.Duration(1),
-	})
-	if err != nil {
-		log.Error("error init etcd client", log.Ctx{"error": err})
-	}
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	_, err = client.Delete(ctx, pserver.PsDesired)
-	if err != nil {
-		panic(err)
-	}
-
-	_, err = client.Delete(ctx, pserver.PsPath)
-	if err != nil {
-		panic(err)
-	}
-
-	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
-	if err != nil {
-		panic(err)
-	}
-
-	ports := initClient()
-	for i := 0; i < numPserver; i++ {
-		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
-		if err != nil {
-			panic(err)
-		}
-	}
-	cancel()
-	err = client.Close()
-	if err != nil {
-		panic(err)
-	}
-}
-
-type selector bool
-
-func (s selector) Select() (bool, error) {
-	return bool(s), nil
-}
-
-func (s selector) Done() error {
-	return nil
-}
-
-type lister []client.Server
-
-func (l lister) List() []client.Server {
-	return l
-}
-
-func testClient(t *testing.T, c *client.Client) {
-	selected, err := c.BeginInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if !selected {
-		t.Fatal("should be selected.")
-	}
-
-	const numParameter = 1000
-	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-
-	var wg sync.WaitGroup
-	for i := 0; i < numParameter; i++ {
-		wg.Add(1)
-		go func(i int) {
-			var p pserver.Parameter
-			p.Name = "p_" + strconv.Itoa(i)
-			p.ElementType = pserver.Float32
-			p.Content = make([]byte, (i+1)*100)
-			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
-			if err != nil {
-				t.Fatal(err)
-			}
-			wg.Done()
-		}(i)
-	}
-	wg.Wait()
-
-	err = c.FinishInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var grads []pserver.Gradient
-	for i := 0; i < numParameter; i++ {
-		var g pserver.Gradient
-		g.Name = "p_" + strconv.Itoa(i)
-		g.ElementType = pserver.Float32
-		g.Content = make([]byte, (i+1)*100)
-		grads = append(grads, g)
-	}
-
-	const paramPerGroup = 10
-	const numGroups = numParameter / paramPerGroup
-
-	// shuffle send grads order
-	for i := range grads {
-		j := rand.Intn(i + 1)
-		grads[i], grads[j] = grads[j], grads[i]
-	}
-
-	for i := 0; i < numGroups; i++ {
-		var gs []pserver.Gradient
-		if i == numGroups-1 {
-			gs = grads[i*paramPerGroup:]
-		} else {
-			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
-		}
-
-		wg.Add(1)
-		go func(gs []pserver.Gradient) {
-			err := c.SendGrads(gs)
-			if err != nil {
-				t.Fatal(err)
-			}
-			wg.Done()
-		}(gs)
-	}
-
-	names := make([]string, numParameter)
-	for i := 0; i < numParameter; i++ {
-		names[i] = "p_" + strconv.Itoa(i)
-	}
-
-	for i := 0; i < numGroups; i++ {
-		var ns []string
-		if i == numGroups-1 {
-			ns = names[i*paramPerGroup:]
-		} else {
-			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
-		}
-
-		wg.Add(1)
-		go func(ns []string) {
-			params, err := c.GetParams(ns)
-			if err != nil {
-				t.Fatal(err)
-			}
-
-			if len(ns) != len(params) {
-				t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-			}
-
-			for i := range params {
-				if ns[i] != params[i].Name {
-					t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name)
-				}
-			}
-			wg.Done()
-		}(ns)
-	}
-
-	wg.Wait()
-}
-
-func TestNativeClient(t *testing.T) {
-	initNativeClient()
-	servers := make([]client.Server, numPserver)
-	for i := 0; i < numPserver; i++ {
-		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
-	}
-	c1 := client.NewClient(lister(servers), len(servers), selector(true))
-	testClient(t, c1)
-}
-
-// EtcdClient is a disabled test, since we have not embedded etcd into
-// our test.
-func EtcdClient(t *testing.T) {
-	initEtcdClient()
-	etcdClient := client.NewEtcd(etcdEndpoints)
-	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
-	testClient(t, c2)
-}
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
deleted file mode 100644
index 3fb835a6e..000000000
--- a/go/pserver/client/etcd_client.go
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package client
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	defaultEtcdTimeout time.Duration = 5 * time.Second
-
-	initLockPath = "/init_ps/lock"
-	initDonePath = "/init_ps/done"
-	initDoneVal  = "1"
-)
-
-// Etcd is used by pserver client that is a part of trainer process.
-// TODO:
-// 1. add watcher to watch the change state of pservers.
-type Etcd struct {
-	client    *clientv3.Client
-	timeout   time.Duration
-	endpoints []string
-	lock      *concurrency.Mutex
-}
-
-// Desired read ps desired number from etcd.
-func (e *Etcd) Desired() int {
-	var psDesired int
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-		resp, err := e.client.Get(ctx, pserver.PsDesired)
-		cancel()
-		if err != nil {
-			log.Error(
-				"Get ps dresire number failed! reconnecting...",
-				log.Ctx{"error": err},
-			)
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		kvs := resp.Kvs
-		if len(kvs) == 0 {
-			log.Info("Waiting for ps desired registered ...")
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
-		if err != nil {
-			log.Error("atoi failed", log.Ctx{"error": err})
-			time.Sleep(e.timeout)
-			continue
-		}
-
-		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
-		break
-	}
-	return psDesired
-}
-
-// List return the pserver list read from etcd.
-func (e *Etcd) List() []Server {
-	psDesired := e.Desired()
-
-	servers := make([]Server, psDesired)
-	for {
-		for i := 0; i < psDesired; i++ {
-			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-			psKey := pserver.PsPath + strconv.Itoa(i)
-			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
-			resp, err := e.client.Get(ctx, psKey)
-			cancel()
-			if err != nil {
-				log.Info(
-					"Get psKey error",
-					log.Ctx{"ps key": psKey, "error": err},
-				)
-				time.Sleep(e.timeout)
-				continue
-			}
-			kvs := resp.Kvs
-			if len(kvs) == 0 {
-				log.Info("Waiting for ps addr registered ...")
-				time.Sleep(e.timeout)
-				continue
-			}
-
-			psAddr := string(resp.Kvs[0].Value)
-			// TODO(Longfei) check the ps address
-			if psAddr == "" {
-				log.Info(
-					"Value under psKey is empty",
-					log.Ctx{"psKey": psKey},
-				)
-				time.Sleep(e.timeout)
-				continue
-			}
-			log.Debug(
-				"got psAddr given psKey",
-				log.Ctx{"psAddr": psAddr, "psKey": psKey},
-			)
-			servers[i].Index = i
-			servers[i].Addr = psAddr
-		}
-		break
-	}
-	return servers
-}
-
-// NewEtcd create a etcd client to return the state of pserver on etcd.
-func NewEtcd(endpoints string) *Etcd {
-	ep := strings.Split(endpoints, ",")
-	var cli *clientv3.Client
-	var err error
-	for {
-		cli, err = clientv3.New(clientv3.Config{
-			Endpoints:   ep,
-			DialTimeout: defaultEtcdTimeout,
-		})
-		if err != nil {
-			log.Error("Init etcd connection failed", log.Ctx{"error": err})
-			time.Sleep(defaultEtcdTimeout)
-			continue
-		}
-		break
-	}
-	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
-	client := &Etcd{
-		client:    cli,
-		timeout:   defaultEtcdTimeout,
-		endpoints: ep,
-	}
-	return client
-}
-
-// Select indicates if the current trainer is selected to initialize
-// the pserver parameters.
-func (e *Etcd) Select() (bool, error) {
-	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
-	if err != nil {
-		return false, err
-	}
-
-	lock := concurrency.NewMutex(sess, initLockPath)
-	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
-	// Do not use timeout context here, since we don't know how
-	// long does it take for other trainers to initialize the
-	// parameters.
-	err = lock.Lock(context.Background())
-	if err != nil {
-		return false, err
-	}
-	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
-
-	get := clientv3.OpGet(initDonePath)
-	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
-	cancel()
-	if err != nil {
-		return false, err
-	}
-
-	if !tresp.Succeeded {
-		return false, errors.New("no longer the owner of the lock")
-	}
-
-	resp := tresp.Responses[0].GetResponseRange()
-
-	if len(resp.Kvs) == 0 {
-		// Key value not set, select current trainer.
-		e.lock = lock
-		log.Info("Trainer selected.")
-		return true, nil
-	}
-
-	if string(resp.Kvs[0].Value) == initDoneVal {
-		log.Info("Initialization is already done.")
-		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
-		err = lock.Unlock(ctx)
-		cancel()
-		if err != nil {
-			log.Error("error unlocking", log.Ctx{"error": err})
-		}
-		return false, nil
-	}
-
-	return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
-}
-
-// Done indicates the parameter initialization process is done.
-func (e *Etcd) Done() error {
-	if e.lock == nil {
-		return errors.New("lock is nil, Done called unexpectedly")
-	}
-
-	put := clientv3.OpPut(initDonePath, initDoneVal)
-	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
-	cancel()
-	if err != nil {
-		return err
-	}
-
-	if !tresp.Succeeded {
-		return errors.New("no longer the owner of the lock")
-	}
-
-	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
-	err = e.lock.Unlock(ctx)
-	cancel()
-	if err != nil {
-		log.Error("error unlocking", log.Ctx{"error": err})
-	} else {
-		e.lock = nil
-	}
-
-	return nil
-}
-
-// Close closes the etcd client.
-func (e *Etcd) Close() error {
-	var err error
-	if e.lock != nil {
-		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
-		err = e.lock.Unlock(ctx)
-		cancel()
-		if err == nil {
-			e.lock = nil
-		}
-	}
-
-	cErr := e.client.Close()
-	if cErr != nil {
-		if err != nil {
-			log.Error("error closing etcd client", log.Ctx{"error": cErr})
-			return err
-		}
-		return cErr
-	}
-
-	return err
-}
diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go
deleted file mode 100644
index 08742433e..000000000
--- a/go/pserver/client/etcd_client_test.go
+++ /dev/null
@@ -1,106 +0,0 @@
-package client_test
-
-import (
-	"io/ioutil"
-	"net/url"
-	"os"
-	"strings"
-	"sync"
-	"testing"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	"github.com/coreos/etcd/embed"
-)
-
-func TestSelector(t *testing.T) {
-	etcdDir, err := ioutil.TempDir("", "")
-	if err != nil {
-		t.Fatal(err)
-	}
-	cfg := embed.NewConfig()
-	lpurl, _ := url.Parse("http://localhost:0")
-	lcurl, _ := url.Parse("http://localhost:0")
-	cfg.LPUrls = []url.URL{*lpurl}
-	cfg.LCUrls = []url.URL{*lcurl}
-	cfg.Dir = etcdDir
-	e, err := embed.StartEtcd(cfg)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	defer func() {
-		e.Close()
-		if err := os.RemoveAll(etcdDir); err != nil {
-			t.Fatal(err)
-		}
-	}()
-
-	<-e.Server.ReadyNotify()
-
-	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
-	endpoint := "127.0.0.1:" + port
-
-	var mu sync.Mutex
-	selectedCount := 0
-	var wg sync.WaitGroup
-	selectAndDone := func(c *client.Etcd) {
-		defer wg.Done()
-
-		selected, err := c.Select()
-		if err != nil {
-			panic(err)
-		}
-
-		if selected {
-			mu.Lock()
-			selectedCount++
-			mu.Unlock()
-			err = c.Done()
-			if err != nil {
-				t.Fatal(err)
-			}
-		}
-	}
-
-	c0 := client.NewEtcd(endpoint)
-	c1 := client.NewEtcd(endpoint)
-	c2 := client.NewEtcd(endpoint)
-	c3 := client.NewEtcd(endpoint)
-	wg.Add(3)
-	go selectAndDone(c0)
-	go selectAndDone(c1)
-	go selectAndDone(c2)
-	wg.Wait()
-
-	// simulate trainer crashed and restarted after the
-	// initialization process.
-	wg.Add(1)
-	go selectAndDone(c3)
-	wg.Wait()
-
-	mu.Lock()
-	if selectedCount != 1 {
-		t.Fatal("selected count wrong:", selectedCount)
-	}
-	mu.Unlock()
-
-	err = c0.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c1.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c2.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = c3.Close()
-	if err != nil {
-		t.Fatal(err)
-	}
-}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
deleted file mode 100644
index 719013b1b..000000000
--- a/go/pserver/etcd_client.go
+++ /dev/null
@@ -1,253 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"context"
-	"errors"
-	"strconv"
-	"strings"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
-	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/inconshreveable/log15"
-)
-
-const (
-	// PsDesired is etcd path for store desired pserver count
-	PsDesired = "/ps_desired"
-	// PsPath is the base dir for pserver to store their addr
-	PsPath = "/ps/"
-	// PsCheckpoint is the etcd path for store checkpoints information
-	PsCheckpoint = "/checkpoints/"
-
-	retryTimeout = 5 * time.Second
-)
-
-// EtcdClient is the etcd client that the pserver uses for fault
-// tolerance, service registry and coordination.
-type EtcdClient struct {
-	numPservers int
-	endpoints   string
-	client      *clientv3.Client
-	sess        *concurrency.Session
-	dialTimeout time.Duration
-	ttlSec      int
-	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
-	externalIP string
-	// desired number of pservers in the job.
-	// assume desired will not change during one training job.
-	desired int
-}
-
-// NewEtcdClient creates an EtcdClient
-func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient {
-	return &EtcdClient{
-		dialTimeout: dialtimeout,
-		ttlSec:      ttlSec,
-		numPservers: numPservers,
-		endpoints:   endpoints,
-	}
-}
-
-// Register registers the pserver on etcd
-//
-// Register returns the index of the current pserver.
-func (e *EtcdClient) Register(port int) (int, error) {
-	var err error
-	e.externalIP, err = networkhelper.GetExternalIP()
-	if err != nil {
-		return 0, err
-	}
-
-	// initialize connection to etcd.
-	ep := strings.Split(e.endpoints, ",")
-	for {
-		cli, err := clientv3.New(clientv3.Config{
-			Endpoints:   ep,
-			DialTimeout: e.dialTimeout,
-		})
-		if err != nil {
-			log.Error("connect to etcd error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		e.client = cli
-		sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
-		if err != nil {
-			log.Error("create etcd session error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		e.sess = sess
-		log.Debug("connected to etcd", log.Ctx{"endpoint": e.endpoints})
-		break
-	}
-	// init /ps_desired using transaction, for multiple pservers may want to write
-	// it at the same time.
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		_, err := e.initDesiredPservers(ctx, e.numPservers)
-		cancel()
-		if err != nil {
-			log.Warn("pserver init error", log.Ctx{"error": err, "num pservers": e.numPservers})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		break
-	}
-	// TODO: when implementing extending or reducing pservers, /ps_desired is
-	// changed, then we need to watch /ps_desired node for events. For now, just
-	// write once when init and read from it.
-	// wait and set s.desired init value
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		resp, err := e.client.Get(ctx, PsDesired)
-		cancel()
-		if err != nil {
-			log.Error("get etcd key error", log.Ctx{"key": PsDesired, "error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		if len(resp.Kvs) != 0 {
-			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
-			if err != nil {
-				log.Error(
-					"psDesired atoi error",
-					log.Ctx{"error": err, "value": string(resp.Kvs[0].Value)},
-				)
-				time.Sleep(retryTimeout)
-				// NOTE: wait util ps_desired value change
-				continue
-			}
-			break
-		}
-	}
-
-	var pserverIdx int
-	// try register pserver node on etcd
-	for {
-		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx, port)
-		cancel()
-		if err != nil {
-			log.Warn("register pserver on etcd error", log.Ctx{"error": err})
-			time.Sleep(retryTimeout)
-			continue
-		}
-		break
-	}
-
-	return pserverIdx, nil
-}
-
-func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
-	return concurrency.NewSTM(e.client, func(c concurrency.STM) error {
-		dsStr := c.Get(PsDesired)
-		if dsStr == "" {
-			c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease()))
-		}
-		return nil
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-}
-
-// registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
-	var idx int
-	_, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error {
-		registered := false
-		for i := 0; i < e.desired; i++ {
-			psKey := PsPath + strconv.Itoa(i)
-			ps := c.Get(psKey)
-			log.Debug(
-				"register pserver got value",
-				log.Ctx{"value": ps, "key": psKey},
-			)
-
-			if ps == "" {
-				// find the first id and write info
-				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
-				c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
-				log.Debug("register finished", log.Ctx{"key": psKey, "value": pserverAddr})
-				idx = i
-				registered = true
-				break
-			}
-		}
-		if registered {
-			return nil
-		}
-		return errors.New("not registered, may due to already have enough pservers")
-	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
-
-	if err != nil {
-		return 0, err
-	}
-
-	return idx, nil
-}
-
-// GetKey gets the value by the specified key
-func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	resp, err := e.client.Get(ctx, key)
-	cancel()
-	if err != nil {
-		return []byte{}, err
-	}
-
-	kvs := resp.Kvs
-	if len(kvs) == 0 {
-		return []byte{}, nil
-	}
-	v := kvs[0].Value
-	return v, nil
-}
-
-// PutKey put into etcd with value by key specified
-func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
-	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	var err error
-	if withLease {
-		_, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
-	} else {
-		_, err = e.client.Put(ctx, key, string(value))
-	}
-	cancel()
-	return err
-}
-
-// Shutdown shuts down the etcd client gracefully.
-func (e *EtcdClient) Shutdown() error {
-	var err error
-	if e.sess != nil {
-		err = e.sess.Close()
-	}
-
-	if e.client != nil {
-		newErr := e.client.Close()
-		if newErr != nil {
-			if err != nil {
-				log.Error("shutdown error", log.Ctx{"error": newErr})
-			} else {
-				err = newErr
-			}
-		}
-	}
-	return err
-}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
deleted file mode 100644
index eba0c47e1..000000000
--- a/go/pserver/optimizer.go
+++ /dev/null
@@ -1,132 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-// #cgo CFLAGS: -I ../../
-// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
-// #include "paddle/legacy/optimizer/optimizer.h"
-// #include <stdlib.h>
-// #include <string.h>
-import "C"
-
-import (
-	"fmt"
-	"unsafe"
-
-	log "github.com/inconshreveable/log15"
-)
-
-type optimizer struct {
-	opt         *C.struct_paddle_optimizer
-	elementType ElementType
-	contentLen  int
-	config      []byte
-}
-
-func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nil {
-		return nil
-	}
-
-	// create a Go clice backed by a C array, reference:
-	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	//
-	// Go garbage collector will not interact with this data, need
-	// to be freed properly.
-	return (*[1 << 30]byte)(p)[:len:len]
-}
-
-func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
-	o := &optimizer{}
-	o.elementType = paramWithConfigs.Param.ElementType
-	o.contentLen = len(paramWithConfigs.Param.Content)
-	p := paramWithConfigs.Param
-	c := paramWithConfigs.Config
-	s := State
-	paramBufferSize := C.size_t(len(p.Content))
-	log.Info("New Optimizer Created with config", log.Ctx{
-		"ElementType": p.ElementType,
-		"ParamSize":   paramBufferSize,
-		"ConfigSize":  len(c),
-		"StateSize":   len(s),
-	})
-	var cbuffer unsafe.Pointer
-	cbuffer = C.malloc(paramBufferSize)
-
-	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
-	var cstate unsafe.Pointer
-	if len(s) != 0 {
-		cstate = unsafe.Pointer(&s[0])
-	}
-
-	var cptr (*C.uchar)
-	if len(c) > 0 {
-		cptr = (*C.uchar)(&c[0])
-	} else {
-		log.Error("empty config", "param name", paramWithConfigs.Param.Name)
-	}
-	o.config = c
-	o.opt = C.paddle_create_optimizer(
-		cptr,
-		C.int(len(c)),
-		C.paddle_element_type(p.ElementType),
-		cbuffer,
-		C.int(paramBufferSize),
-		(*C.char)(cstate),
-		C.int(len(s)),
-	)
-	return o
-}
-
-func (o *optimizer) GetWeights() []byte {
-	var buffer unsafe.Pointer
-	// we do not own the buffer, no need to free later.
-	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
-	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
-}
-
-func (o *optimizer) GetStates() []byte {
-	var cbuffer *C.char
-	// we owns the state buffer, need to free later.
-	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
-	buf := cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
-	cpy := make([]byte, len(buf))
-	copy(cpy, buf)
-	C.free(unsafe.Pointer(cbuffer))
-	return cpy
-}
-
-func (o *optimizer) UpdateParameter(g Gradient) error {
-	if o.elementType != g.ElementType {
-		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
-	}
-
-	if o.contentLen != len(g.Content) {
-		return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
-	}
-
-	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
-	if r != 0 {
-		return fmt.Errorf("optimizer update returned error code: %d", r)
-	}
-	return nil
-}
-
-func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nil {
-		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nil)
-	}
-}
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
deleted file mode 100644
index 3b923879d..000000000
--- a/go/pserver/optimizer_test.go
+++ /dev/null
@@ -1,78 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"encoding/binary"
-	"io/ioutil"
-	"math"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-)
-
-func TestOptimizerCreateRelease(t *testing.T) {
-	p := Parameter{
-		Name:        "a",
-		ElementType: Int32,
-	}
-	p.Content = []byte{1, 3}
-	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	param := ParameterWithConfig{
-		Param:  p,
-		Config: config,
-	}
-	o := newOptimizer(param, nil)
-	o.Cleanup()
-}
-
-func float32Bytes(float float32) []byte {
-	bits := math.Float32bits(float)
-	bytes := make([]byte, 4)
-	binary.LittleEndian.PutUint32(bytes, bits)
-	return bytes
-}
-
-func TestOptimizerState(t *testing.T) {
-	p := Parameter{
-		Name:        "a",
-		ElementType: Int32,
-	}
-	weights := float32Bytes(100)
-	p.Content = weights
-	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	param := ParameterWithConfig{
-		Param:  p,
-		Config: config,
-	}
-	o := newOptimizer(param, nil)
-	s := o.GetStates()
-
-	// clear param content and check if the state is restored.
-	param.Param.Content = float32Bytes(300)
-	o1 := newOptimizer(param, s)
-	s1 := o1.GetStates()
-	assert.Equal(t, s, s1)
-	assert.Equal(t, weights, o.GetWeights())
-	assert.Equal(t, weights, o1.GetWeights())
-	o.Cleanup()
-	o1.Cleanup()
-}
diff --git a/go/pserver/service.go b/go/pserver/service.go
deleted file mode 100644
index d6ead774a..000000000
--- a/go/pserver/service.go
+++ /dev/null
@@ -1,450 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver
-
-import (
-	"bufio"
-	"bytes"
-	"encoding/binary"
-	"encoding/gob"
-	"encoding/json"
-	"errors"
-	"fmt"
-	"hash/crc32"
-	"io/ioutil"
-	"os"
-	"path"
-	"strconv"
-	"strings"
-	"sync"
-	"time"
-
-	"github.com/golang/protobuf/proto"
-	uuid "github.com/satori/go.uuid"
-
-	pb "github.com/PaddlePaddle/Paddle/go/proto"
-
-	log "github.com/inconshreveable/log15"
-)
-
-// ElementType is the type of elements of a Parameter.
-type ElementType int
-
-// ErrCheckpointNotFound indicates that the pserver checkpoint could
-// not be found.
-var ErrCheckpointNotFound = errors.New("checkpoint not found in etcd")
-
-// RPC error message.
-const (
-	AlreadyInitialized = "pserver already initialized"
-	Uninitialized      = "pserver not fully initialized"
-	WrongChecksum      = "checkpoint file checksum validation failed"
-)
-
-// Supported element types.
-const (
-	Int32 ElementType = iota
-	UInt32
-	Int64
-	UInt64
-	Float32
-	Float64
-)
-
-// Parameter is a piece of data to sync with the parameter server.
-type Parameter struct {
-	Name        string
-	ElementType ElementType
-	Content     []byte
-}
-
-func float32ToString(b []byte) string {
-	f := make([]float32, len(b)/4)
-	buf := bytes.NewReader(b)
-	err := binary.Read(buf, binary.LittleEndian, &f)
-	if err != nil {
-		return ""
-	}
-	return fmt.Sprintf("%v", f)
-}
-
-func float32ByteToString(c []byte) string {
-	var a []byte
-	var b []byte
-	if len(c) <= 80 {
-		a = c
-	} else {
-		a = c[0:40]
-		b = c[len(c)-40:]
-	}
-
-	var s string
-	s = float32ToString(a)
-
-	if b == nil {
-		return s
-	}
-
-	s = strings.Replace(s, "]", "", -1) + "..." + strings.Replace(float32ToString(b), "[", "", -1)
-	return s
-}
-
-func (p Parameter) String() string {
-	if p.ElementType != Float32 {
-		return fmt.Sprintf("name:%v ElementType:%v",
-			p.Name, p.ElementType)
-	}
-
-	return float32ByteToString(p.Content)
-}
-
-// ParameterWithConfig contains the parameter and the configuration.
-type ParameterWithConfig struct {
-	Param  Parameter
-	Config []byte // parameter configuration in Proto Buffer format
-}
-
-// checkpointMeta saves checkpoint metadata
-type checkpointMeta struct {
-	UUID      string `json:"uuid"`
-	Path      string `json:"path"`
-	CRC32     uint32 `json:"crc32"`
-	Timestamp int64  `json:"timestamp"`
-}
-
-// Checkpoint is the pserver shard persist in file.
-type Checkpoint []parameterCheckpoint
-
-// Gradient is the gradient of the parameter.
-type Gradient Parameter
-
-// Service is the RPC service for pserver.
-type Service struct {
-	initialized        chan struct{}
-	idx                int
-	checkpointInterval time.Duration
-	checkpointPath     string
-	client             KVStore
-
-	mu     sync.Mutex
-	optMap map[string]*optimizer
-}
-
-// parameterCheckpoint saves parameter checkpoint.
-type parameterCheckpoint struct {
-	ParameterWithConfig
-	State []byte
-}
-
-type KVStore interface {
-	GetKey(key string, timeout time.Duration) ([]byte, error)
-	PutKey(key string, value []byte, timeout time.Duration, withLease bool) error
-}
-
-func loadMeta(e KVStore, idx int) (meta checkpointMeta, err error) {
-	v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
-	if err != nil {
-		return
-	}
-
-	if len(v) == 0 {
-		err = ErrCheckpointNotFound
-		return
-	}
-
-	if err = json.Unmarshal(v, &meta); err != nil {
-		return
-	}
-
-	return
-}
-
-// LoadCheckpoint loads checkpoint from file.
-func LoadCheckpoint(e KVStore, idx int) (Checkpoint, error) {
-	log.Info("Loading checkpoint", "pserver index", idx)
-	defer traceTime(time.Now(), "load checkpoint")
-
-	cpMeta, err := loadMeta(e, idx)
-	if err != nil {
-		return nil, err
-	}
-
-	content, err := ioutil.ReadFile(cpMeta.Path)
-	if err != nil {
-		return nil, err
-	}
-
-	crc32 := crc32.ChecksumIEEE(content)
-	if crc32 != cpMeta.CRC32 {
-		return nil, errors.New(WrongChecksum)
-	}
-
-	dec := gob.NewDecoder(bytes.NewReader(content))
-	var cp Checkpoint
-	if err = dec.Decode(&cp); err != nil {
-		return nil, err
-	}
-
-	return cp, nil
-}
-
-// NewService creates a new service, will bypass etcd registration if no
-// endpoints specified. It will recovery from checkpoint file if a exists a specified checkpoint.
-func NewService(idx int, interval time.Duration, path string, client KVStore, cp Checkpoint) (*Service, error) {
-	s := &Service{
-		idx:                idx,
-		checkpointInterval: interval,
-		checkpointPath:     path,
-		client:             client,
-	}
-	s.optMap = make(map[string]*optimizer)
-	s.initialized = make(chan struct{})
-
-	if cp != nil {
-		for _, item := range cp {
-			p := ParameterWithConfig{
-				Param:  item.Param,
-				Config: item.Config,
-			}
-			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
-		}
-		close(s.initialized)
-	}
-	return s, nil
-}
-
-// InitParam initializes a parameter.
-func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
-	select {
-	case <-s.initialized:
-		log.Warn("init param called but parameters already initialized.")
-		return errors.New(AlreadyInitialized)
-	default:
-	}
-
-	c := &pb.OptimizerConfig{}
-	proto.Unmarshal(paramWithConfigs.Config, c)
-	log.Debug(fmt.Sprintf("OptimizerConfig:%v", c))
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	// TODO(helin): check if paramWithConfigs.Param.Content is
-	// properly memory aligned, if not, make copy to a memory
-	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
-	log.Info(
-		"init parameter",
-		"name", paramWithConfigs.Param.Name,
-		"config len", len(paramWithConfigs.Config),
-		"param len", len(paramWithConfigs.Param.Content),
-		"type", paramWithConfigs.Param.ElementType,
-	)
-	return nil
-}
-
-// FinishInitParams tells the parameter server that the parameter
-// initialization has finished.
-func (s *Service) FinishInitParams(_ int, _ *int) error {
-	select {
-	case <-s.initialized:
-		log.Warn("finished init param called but parameters already initialized.")
-		return errors.New(AlreadyInitialized)
-	default:
-	}
-
-	close(s.initialized)
-	go func() {
-		t := time.Tick(s.checkpointInterval)
-		for range t {
-			err := s.checkpoint()
-			if err != nil {
-				log.Error("checkpoint error", log.Ctx{"error": err})
-			}
-		}
-	}()
-
-	log.Info("init parameter finished.")
-	return nil
-}
-
-// SendGrad sends gradient to parameter servers for parameter
-// optimization.
-func (s *Service) SendGrad(g Gradient, _ *int) error {
-	select {
-	case <-s.initialized:
-	default:
-		log.Warn("received gradient before initialization.",
-			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-		return errors.New(Uninitialized)
-	}
-
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	o, ok := s.optMap[g.Name]
-	if !ok {
-		log.Warn("received gradient but can't find name.",
-			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-		return fmt.Errorf("parameter: %s does not exist", g.Name)
-	}
-
-	log.Debug(Parameter(g).String())
-	log.Info("received gradient from trainer, updating gradient.",
-		"name", g.Name, "size", len(g.Content), "type", g.ElementType)
-	return o.UpdateParameter(g)
-}
-
-// GetParam gets parameters from the parameter server.
-func (s *Service) GetParam(name string, parameter *Parameter) error {
-	<-s.initialized
-	s.mu.Lock()
-	defer s.mu.Unlock()
-
-	opt, ok := s.optMap[name]
-	if !ok {
-		log.Warn("trainer wants to get a parameter that does not exist.", "name", name)
-		return fmt.Errorf("parameter: %s does not exist", name)
-	}
-
-	// The parameter content (a byte slice) may change
-	// during RPC serialization due to write from other
-	// goroutine, we allow it since mini-batch based deep
-	// learning optimization methods are stochastic in
-	// nature. This race condition is allowed deliberately
-	// to save the program from making a copy of the
-	// parameter content.
-	parameter.Name = name
-	parameter.ElementType = opt.elementType
-	parameter.Content = opt.GetWeights()
-	log.Debug(parameter.String())
-	log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
-	return nil
-}
-
-func traceTime(start time.Time, name string) {
-	elapsed := time.Since(start)
-	log.Info("time elapsed", log.Ctx{"name": name, "elapsed": elapsed})
-}
-
-// checkpoint saves checkpoint to disk.
-//
-// checkpoint should be only called after the parameters are
-// initialized.
-func (s *Service) checkpoint() (err error) {
-	log.Info("Begin save checkpoint.")
-	defer traceTime(time.Now(), "save checkpoint")
-
-	s.mu.Lock()
-	cp := make([]parameterCheckpoint, len(s.optMap))
-	index := 0
-	// TODO(helin): write checkpoint incrementally to reduce memory
-	// footprint during checkpoint.
-	for name, opt := range s.optMap {
-		var pc parameterCheckpoint
-		pc.Param.Name = name
-		pc.Param.ElementType = opt.elementType
-		pc.Param.Content = opt.GetWeights()
-		pc.Config = opt.config
-		pc.State = opt.GetStates()
-		cp[index] = pc
-		index++
-	}
-	s.mu.Unlock()
-
-	var buf bytes.Buffer
-	encoder := gob.NewEncoder(&buf)
-	err = encoder.Encode(cp)
-	if err != nil {
-		return
-	}
-
-	if _, err = os.Stat(s.checkpointPath); os.IsNotExist(err) {
-		err = os.MkdirAll(s.checkpointPath, os.ModePerm)
-		if err != nil {
-			return
-		}
-	}
-
-	id := uuid.NewV4().String()
-	p := path.Join(s.checkpointPath, id)
-	f, err := os.Create(p)
-	if err != nil {
-		return
-	}
-
-	defer func() {
-		closeErr := f.Close()
-		if closeErr != nil {
-			if err != nil {
-				log.Error("error close checkpoint file", log.Ctx{"error": closeErr})
-			} else {
-				// Set closeErr as return value.
-				err = closeErr
-			}
-		}
-	}()
-
-	writer := bufio.NewWriter(f)
-	_, err = writer.Write(buf.Bytes())
-	if err != nil {
-		return
-	}
-
-	err = writer.Flush()
-	if err != nil {
-		return
-	}
-
-	oldMeta, err := loadMeta(s.client, s.idx)
-	if err == ErrCheckpointNotFound {
-		log.Info("old meta not found, skip removing old meta")
-		err = nil
-	} else if err == nil {
-		log.Info("removing old meta")
-		if oldMeta.Path != "" {
-			rmErr := os.Remove(oldMeta.Path)
-			if rmErr != nil {
-				// log error, but still treat checkpoint as
-				// successful.
-				log.Error("remove old meta file error", log.Ctx{"error": rmErr})
-			}
-		}
-	}
-
-	if err != nil {
-		return
-	}
-
-	crc32 := crc32.ChecksumIEEE(buf.Bytes())
-	cpMeta := checkpointMeta{
-		UUID:      id,
-		Timestamp: time.Now().UnixNano(),
-		CRC32:     crc32,
-		Path:      p,
-	}
-
-	json, err := json.Marshal(cpMeta)
-	if err != nil {
-		return
-	}
-
-	err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), json, 3*time.Second, false)
-	if err != nil {
-		return
-	}
-
-	return
-}
diff --git a/go/pserver/service_internal_test.go b/go/pserver/service_internal_test.go
deleted file mode 100644
index 36eca5112..000000000
--- a/go/pserver/service_internal_test.go
+++ /dev/null
@@ -1,86 +0,0 @@
-package pserver
-
-import (
-	"bytes"
-	"encoding/binary"
-	"fmt"
-	"testing"
-	"time"
-
-	"github.com/stretchr/testify/assert"
-)
-
-const testDir = "./test_data"
-
-type myKV struct {
-	m map[string][]byte
-}
-
-func (m *myKV) GetKey(key string, timeout time.Duration) ([]byte, error) {
-	if m.m == nil {
-		m.m = make(map[string][]byte)
-	}
-	return m.m[key], nil
-}
-
-func (m *myKV) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
-	if m.m == nil {
-		m.m = make(map[string][]byte)
-	}
-	m.m[key] = value
-	return nil
-}
-
-func TestCheckpoint(t *testing.T) {
-	kv := &myKV{}
-	s, err := NewService(0, time.Hour, testDir, kv, nil)
-	assert.Nil(t, err)
-	err = s.checkpoint()
-	assert.Nil(t, err)
-	_, err = LoadCheckpoint(kv, 0)
-	assert.Nil(t, err)
-}
-
-func float32ToByte(f float32) []byte {
-	var buf bytes.Buffer
-	err := binary.Write(&buf, binary.LittleEndian, f)
-	if err != nil {
-		fmt.Println("binary.Write failed:", err)
-	}
-	return buf.Bytes()
-}
-
-func TestCheckpointWithData(t *testing.T) {
-	kv := &myKV{}
-	s, err := NewService(0, time.Hour, testDir, kv, nil)
-	assert.Nil(t, err)
-
-	var content []byte
-	for i := 0; i < 50000; i++ {
-		content = append(content, float32ToByte(float32(i))...)
-	}
-
-	p1 := Parameter{Name: "p1", ElementType: 1, Content: content}
-	err = s.InitParam(ParameterWithConfig{Param: p1}, nil)
-	assert.Nil(t, err)
-
-	err = s.FinishInitParams(0, nil)
-	assert.Nil(t, err)
-
-	var p2 Parameter
-	err = s.GetParam(p1.Name, &p2)
-	assert.Nil(t, err)
-	assert.Equal(t, p1, p2)
-
-	err = s.checkpoint()
-	assert.Nil(t, err)
-	cp, err := LoadCheckpoint(kv, 0)
-	assert.Nil(t, err)
-	s1, err := NewService(0, time.Hour, testDir, kv, cp)
-	assert.Nil(t, err)
-
-	var p3 Parameter
-	err = s1.GetParam(p1.Name, &p3)
-	assert.Nil(t, err)
-	assert.Equal(t, p1, p3)
-}
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
deleted file mode 100644
index 6949348e9..000000000
--- a/go/pserver/service_test.go
+++ /dev/null
@@ -1,211 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package pserver_test
-
-import (
-	"fmt"
-	"io/ioutil"
-	"reflect"
-	"sync"
-	"testing"
-	"time"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-)
-
-const (
-	OptimizerConfig = "./client/c/test/testdata/optimizer.pb"
-)
-
-func TestServiceFull(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Error(err)
-	}
-	var p pserver.Parameter
-	p.Name = "param_a"
-	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
-	p.ElementType = pserver.Int32
-	config, err := ioutil.ReadFile(OptimizerConfig)
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var p1 pserver.Parameter
-	p1.Name = "param_b"
-	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
-	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var param pserver.Parameter
-	err = s.GetParam("param_b", &param)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if !reflect.DeepEqual(param, p1) {
-		t.Fatal("not equal:", param, p1)
-	}
-
-	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-
-	err = s.SendGrad(g1, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-	err = s.SendGrad(g2, nil)
-
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var param1 pserver.Parameter
-	err = s.GetParam("param_a", &param1)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// don't compare content, since it's already changed by
-	// gradient update.
-	param1.Content = nil
-	p.Content = nil
-
-	if !reflect.DeepEqual(param1, p) {
-		t.Fatal("not equal:", param1, p)
-	}
-}
-
-func TestMultipleInit(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Fatal(err)
-	}
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err.Error() != pserver.AlreadyInitialized {
-		t.Fatal(err)
-	}
-}
-
-func TestUninitialized(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	err = s.SendGrad(pserver.Gradient{}, nil)
-	if err.Error() != pserver.Uninitialized {
-		t.Fatal(err)
-	}
-}
-
-func TestBlockUntilInitialized(t *testing.T) {
-	var cp pserver.Checkpoint
-	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
-	if err != nil {
-		t.Error(err)
-	}
-	ch := make(chan struct{}, 2)
-	errCh := make(chan error, 2)
-	var wg sync.WaitGroup
-	wg.Add(1)
-	go func() {
-		var param pserver.Parameter
-		err := s.GetParam("param_a", &param)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
-	time.Sleep(50 * time.Millisecond)
-
-	select {
-	case <-ch:
-		// some function returned before initialization is completed.
-		t.FailNow()
-	case <-errCh:
-		t.FailNow()
-	default:
-	}
-
-	var p pserver.Parameter
-	p.Name = "param_a"
-	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
-	p.ElementType = pserver.Int32
-	config, err := ioutil.ReadFile(OptimizerConfig)
-	if err != nil {
-		t.Fatalf("read optimizer proto failed")
-	}
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
-
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	err = s.FinishInitParams(0, nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	wg.Wait()
-}
-
-func TestGradientString(t *testing.T) {
-	g := pserver.Parameter{}
-	g.ElementType = pserver.Float32
-	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
-	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
-		t.Fatal("get float data error!")
-	}
-
-	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
-		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
-	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699...3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
-		t.Fatal("get float data error!", g.String())
-	}
-	fmt.Println(g)
-}
diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt
deleted file mode 100644
index 3100f2b5a..000000000
--- a/go/utils/networkhelper/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-if(WITH_TESTING)
-  go_test(network_helper_test)
-endif()
diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go
deleted file mode 100644
index d205b6c50..000000000
--- a/go/utils/networkhelper/helper.go
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package networkhelper
-
-import (
-	"errors"
-	"net"
-)
-
-// GetExternalIP returns the ip address of local network interface, not the
-// loopback device.
-func GetExternalIP() (string, error) {
-	ifaces, err := net.Interfaces()
-	if err != nil {
-		return "", err
-	}
-	for _, iface := range ifaces {
-		if iface.Flags&net.FlagUp == 0 {
-			continue // interface down
-		}
-		if iface.Flags&net.FlagLoopback != 0 {
-			continue // loopback interface
-		}
-		addrs, err := iface.Addrs()
-		if err != nil {
-			return "", err
-		}
-		for _, addr := range addrs {
-			var ip net.IP
-			switch v := addr.(type) {
-			case *net.IPNet:
-				ip = v.IP
-			case *net.IPAddr:
-				ip = v.IP
-			}
-			if ip == nil || ip.IsLoopback() {
-				continue
-			}
-			ip = ip.To4()
-			if ip == nil {
-				continue // not an ipv4 address
-			}
-			return ip.String(), nil
-		}
-	}
-	return "", errors.New("are you connected to the network?")
-}
diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go
deleted file mode 100644
index 60b520fae..000000000
--- a/go/utils/networkhelper/helper_test.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-// http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package networkhelper
-
-import "testing"
-
-func TestGetIP(t *testing.T) {
-	_, err := GetExternalIP()
-	if err != nil {
-		t.Errorf("GetExternalIP returns error : %v\n", err)
-	}
-}
diff --git a/proto/.gitignore b/proto/.gitignore
deleted file mode 100644
index a0f00082c..000000000
--- a/proto/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-CMakeLists.txt
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
deleted file mode 100644
index a075eeb83..000000000
--- a/proto/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-if (MOBILE_INFERENCE)
-    file(GLOB proto_filenames . ModelConfig.proto ParameterConfig.proto
-         TrainerConfig.proto DataConfig.proto)
-else()
-    file(GLOB proto_filenames . *.proto)
-endif()
-
-include_directories(${CMAKE_CURRENT_BINARY_DIR})
-proto_library(paddle_proto SRCS ${proto_filenames})
-
-set(PROTO_GEN)
-set(PROTO_GEN_PY)
-
-foreach(filename ${proto_filenames})
-    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
-    get_filename_component(FIL_WE ${filename} NAME_WE)
-    set(CUR_PROTO_GEN_PY
-            ${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
-    set(PROTO_GEN_PY
-            ${CUR_PROTO_GEN_PY}
-            ${PROTO_GEN_PY})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-            COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
-            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
-            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} protoc)
-endforeach()
-
-add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-
-
-if (WITH_GOLANG)
-    add_custom_target(protoc-gen-go)
-    add_custom_command(TARGET protoc-gen-go
-            COMMAND go 
-            ARGS "get" "-u" "github.com/golang/protobuf/protoc-gen-go")
-
-    set(PROTO_GEN_GO)
-    file(GLOB proto_filenames . OptimizerConfig.proto)
-    foreach(filename ${proto_filenames})
-        message(STATUS ${filename})
-        get_filename_component(ABS_FIL ${filename} ABSOLUTE)
-        get_filename_component(FIL_WE ${filename} NAME_WE)
-        set(CUR_PROTO_GEN_GO
-                ${PADDLE_SOURCE_DIR}/paddle/go/proto/${FIL_WE}.pb.go)
-        set(PROTO_GEN_GO
-                ${CUR_PROTO_GEN_GO}
-                ${PROTO_GEN_GO})
-        add_custom_command(OUTPUT ${CUR_PROTO_GEN_GO}
-                COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-                ARGS "--go_out=${PADDLE_SOURCE_DIR}/go/proto"
-                "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-                DEPENDS ${ABS_FIL} protoc protoc-gen-go)
-    endforeach()
-    add_custom_target(gen_proto_go ALL DEPENDS ${PROTO_GEN_GO})
-endif()
diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto
deleted file mode 100644
index 1b2aa8e72..000000000
--- a/proto/DataConfig.proto
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-message FileGroupConf {
-  optional uint32 queue_capacity = 1 [ default = 1 ];
-  // how many files to load for a load file thread
-  optional int32 load_file_count = 2 [ default = 1 ];
-  // how many threads to load files
-  // Setting to be 5~10 is appropriate when loading files by hadoop vfs
-  optional int32 load_thread_num = 3 [ default = 1 ];
-};
-
-message DataConfig {
-
-  required string type = 1;
-
-  // name of a text file which contains a list of file names at each line
-  optional string files = 3;
-
-  optional int32 feat_dim = 4;         // feature dimension of one frame
-  repeated int32 slot_dims = 5;        // feature slot dims
-  optional int32 context_len = 6;      // max neibour frame numbers
-  optional uint64 buffer_capacity = 7; // the number of samples
-
-  // part of data used in training
-  // if not -1, part of train data is used in training
-  optional int64 train_sample_num = 8 [ default = -1 ];
-
-  // The number of documents processed once
-  optional int32 file_load_num = 9 [ default = -1 ];
-  optional bool async_load_data = 12 [ default = false ];
-  /// Note the field number 10, 11 and 13 have been deprecated.
-  optional bool for_test = 14
-      [ default = false ]; // whether this data is for test
-  optional FileGroupConf file_group_conf = 15;
-  repeated int32 float_slot_dims = 16;
-
-  /// Note the field number 17, 18 and 19 have been deprecated.
-
-  // a list of values which will be used to create additional one dimensional
-  // float
-  // values slots. These one dimensional slots can be used as the weight input
-  // for cost layers.
-  // Currently this is only supported by ProtoDataProvider.
-  repeated double constant_slots = 20;
-
-  // for PyDataProvider.
-  // Specify the load data script module name, object name and user args
-  optional string load_data_module = 21;
-  optional string load_data_object = 22;
-  optional string load_data_args = 23;
-
-  // for MultiDataProvider
-  repeated DataConfig sub_data_configs = 24; // sub dataproviders
-                                             /*
-                                              * the ratio of each sub dataproviders:
-                                              * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
-                                              * then each mini-batch is combined by 10 instance from A and 90 instances
-                                              * from B.
-                                              */
-  optional int32 data_ratio = 25;
-  /*
-   * if one of the sub dataproviders is running out of data, then
-   * (1) it is "main data", then finish current pass.
-   * (2) it is not "main data", then reset it, and try getNextBatch again.
-   */
-  optional bool is_main_data = 26 [ default = true ];
-
-  // the usage ratio of instances. Setting to 1.0 means the use of all
-  // instances.
-  optional double usage_ratio = 27 [ default = 1.0 ];
-};
diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto
deleted file mode 100644
index 46b1f58bd..000000000
--- a/proto/DataFormat.proto
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/*
- If values is not empty and ids is empty, this is a dense vector.
- If values is not empty and ids is not empty, this is a sparse vector. The
- position of each value
- is specified by ids.
- If values is empty and ids is not empty, this is a sparse vector whose non-zero
- values are 1.
- The position of each 1 is specified by ids.
-*/
-message VectorSlot {
-  repeated float values = 1 [ packed = true ];
-  repeated uint32 ids = 2 [ packed = true ];
-  /* For multidimensional data, for example "image width height depth" */
-  repeated uint32 dims = 3 [ packed = true ];
-  repeated string strs = 4;
-};
-
-/*
- SubseqSlot use to record whether VectorSlot or any other slot in future has
- subseq.
- If not all VectorSlot have subseq, we only store the one who has subseq, and
- use *slot_id* to record it.
- One vector_slots has one sequence, and it may have N subseq, thus the number of
- *lens* will be N too.
-*/
-message SubseqSlot {
-  required uint32 slot_id = 1; // the id of slot who has subseq
-  repeated uint32 lens = 2;    // lengths of sub-sequence in the slot
-};
-
-message SlotDef {
-  enum SlotType {
-    VECTOR_DENSE = 0;
-    VECTOR_SPARSE_NON_VALUE = 1;
-    VECTOR_SPARSE_VALUE = 2;
-    INDEX = 3; // This can be used as label, or word id, etc.
-    VAR_MDIM_DENSE = 4;
-    VAR_MDIM_INDEX = 5;
-    STRING = 6;
-  }
-  required SlotType type = 1;
-  required uint32 dim =
-      2; // For INDEX slots, this means the maximal index plus 1.
-};
-
-message DataHeader {
-  // INDEX slot should be always after VECTOR slots.
-  repeated SlotDef slot_defs = 1;
-};
-
-message DataSample {
-  optional bool is_beginning = 1
-      [ default = true ]; // is the beginning of a sequence
-  repeated VectorSlot vector_slots = 2;
-  repeated uint32 id_slots = 3 [ packed = true ];
-  /* use ids of VectorSlot */
-  repeated VectorSlot var_id_slots = 4;
-  repeated SubseqSlot subseq_slots = 5;
-};
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
deleted file mode 100644
index d78ee9c9f..000000000
--- a/proto/ModelConfig.proto
+++ /dev/null
@@ -1,698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "ParameterConfig.proto";
-
-package paddle;
-
-/**
- * Various structs for the configuration of a neural network
- */
-
-message ExternalConfig {
-  repeated string layer_names = 1;
-  repeated string input_layer_names = 2;
-  repeated string output_layer_names = 3;
-}
-
-message ActivationConfig {
-  // identity: f(x) = x
-  // sigmoid: f(x) = 1 / (1 + exp(-x))
-  // logistic: f(x) = (1 - exp(-x)) / (1+ exp(-x))
-  // softmax: y_i = f(x_i) = exp(x_i) / (\sum_i exp(x_i))
-  // relu: y = max(0, x)
-  required string type = 1;
-};
-
-message ConvConfig {
-  // filter_size = 5, says that this layer will use
-  // filters of size 5x5 pixels.
-  required uint32 filter_size = 1;
-
-  // The image data dimensionality.
-  // This value must be either 1, 2, 3, or a multiple of 4.
-  required uint32 channels = 2;
-
-  // stride = 1, indicates that the distance between
-  // successive filter applications should be 1 pixel.
-  required uint32 stride = 3;
-
-  // padding = 4, instructs the net to implicitly
-  // pad the images with a 4-pixel border of zeros.
-  required uint32 padding = 4;
-
-  // If groups = 4 together with the filters = 32 parameter,
-  // they state that this convolutional layer is to have 4
-  // groups of 32 filters. Each filter will connect to 8
-  // input channels.
-  required uint32 groups = 5;
-  required uint32 filter_channels = 6;
-
-  // The size of output feature map.
-  required uint32 output_x = 7;
-
-  // The size of input feature map.
-  required uint32 img_size = 8;
-
-  // caffe mode for output size coherence
-  required bool caffe_mode = 9 [ default = true ];
-
-  // if filter_size_y is set , this convolutional layer will use
-  // filters of size filter_size * filter_size_y pixels.
-  // if filter_size_y is not set, this convolutional layer will use
-  // filters of size filter_size * filter_size
-  required uint32 filter_size_y = 10;
-  required uint32 padding_y = 11;
-  required uint32 stride_y = 12;
-
-  // if not set, use output_x
-  optional uint32 output_y = 13;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 14;
-
-  optional uint32 dilation = 15 [ default = 1 ];
-  optional uint32 dilation_y = 16 [ default = 1 ];
-
-  optional uint32 filter_size_z = 17 [ default = 1 ];
-  optional uint32 padding_z = 18 [ default = 1 ];
-  optional uint32 stride_z = 19 [ default = 1 ];
-  optional uint32 output_z = 20 [ default = 1 ];
-  optional uint32 img_size_z = 21 [ default = 1 ];
-}
-
-message PoolConfig {
-  // max or avg pooling
-  required string pool_type = 1;
-  required uint32 channels = 2;
-
-  // Defines the size of the pooling region in
-  // the x (equivalently, y) dimension.
-  required uint32 size_x = 3;
-
-  // Tell the net where in the input image to start the pooling.
-  // start is deprecated now.
-  optional uint32 start = 4;
-
-  // Defines the stride size between successive pooling squares.
-  required uint32 stride = 5 [ default = 1 ];
-
-  // The size of output feature map.
-  required uint32 output_x = 6;
-
-  // The size of input feature map.
-  required uint32 img_size = 7;
-
-  // padding = 4, instructs the net to implicitly
-  // pad the images with a 4-pixel border of zeros.
-  optional uint32 padding = 8 [ default = 0 ];
-
-  // if not set, use size_x
-  optional uint32 size_y = 9;
-
-  // if not set, use stride
-  optional uint32 stride_y = 10;
-
-  // if not set, use output_x
-  optional uint32 output_y = 11;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 12;
-
-  // if not set, use padding
-  optional uint32 padding_y = 13;
-
-  optional uint32 size_z = 14 [ default = 1 ];
-  optional uint32 stride_z = 15 [ default = 1 ];
-  optional uint32 output_z = 16 [ default = 1 ];
-  optional uint32 img_size_z = 17 [ default = 1 ];
-  optional uint32 padding_z = 18 [ default = 1 ];
-
-  optional bool exclude_mode = 19;
-}
-
-message SppConfig {
-  required ImageConfig image_conf = 1;
-  required string pool_type = 2;
-  required uint32 pyramid_height = 3;
-}
-
-message NormConfig {
-  // rnorm or cmrnorm
-  required string norm_type = 1;
-  required uint32 channels = 2;
-
-  // rnorm: this defines the size of the local regions
-  // used for response normalization.
-  // cmrnorm: The size parameter indicates how many
-  // nearby maps to use for normalization.
-  required uint32 size = 3;
-
-  // the parameters for normalization
-  // u = u / (1+scale*sum(u^2 in window))^pow
-  required double scale = 4;
-  required double pow = 5;
-
-  // The size of output feature map.
-  required uint32 output_x = 6;
-
-  // The size of input feature map.
-  required uint32 img_size = 7;
-
-  // normalize with fixed window or sliding window
-  // u = u / (1+scale*sum(u^2 in window))^pow
-  // fixed window: shared a fixed window for each value
-  // sliding window: have a different window for each value
-  optional bool blocked = 8;
-
-  // if not set, use output_x
-  optional uint32 output_y = 9;
-
-  // if not set, use img_size
-  optional uint32 img_size_y = 10;
-}
-
-message BlockExpandConfig {
-  required uint32 channels = 1;
-
-  required uint32 stride_x = 2;
-  required uint32 stride_y = 3;
-
-  required uint32 padding_x = 4;
-  required uint32 padding_y = 5;
-
-  required uint32 block_x = 6;
-  required uint32 block_y = 7;
-
-  // The size of output feature map.
-  required uint32 output_x = 8;
-  required uint32 output_y = 9;
-
-  // The size of input feature map.
-  required uint32 img_size_x = 10;
-  required uint32 img_size_y = 11;
-}
-
-message MaxOutConfig {
-  required ImageConfig image_conf = 1;
-  required uint32 groups = 2;
-}
-
-message RowConvConfig { required uint32 context_length = 1; }
-
-message SliceConfig {
-  required uint32 start = 1;
-  required uint32 end = 2;
-}
-
-message ProjectionConfig {
-  required string type = 1;
-  required string name = 2;
-  required uint64 input_size = 3;
-  required uint64 output_size = 4;
-
-  // For ShiftProjection
-  optional int32 context_start = 5;
-  optional int32 context_length = 6;
-  optional bool trainable_padding = 7 [ default = false ];
-
-  // For convolution
-  optional ConvConfig conv_conf = 8;
-  optional int32 num_filters = 9;
-
-  // For IdentityOffsetProjection
-  optional uint64 offset = 11 [ default = 0 ];
-
-  // For pool
-  optional PoolConfig pool_conf = 12;
-
-  // For slice
-  // Each slice output is the input[start, end)
-  repeated SliceConfig slices = 13;
-}
-
-message OperatorConfig {
-  required string type = 1;
-  repeated int32 input_indices = 2;
-  repeated uint64 input_sizes = 3;
-  required uint64 output_size = 4;
-
-  // For DotMulOperator
-  optional double dotmul_scale = 5 [ default = 1.0 ];
-
-  // For ConvOperator
-  optional ConvConfig conv_conf = 6;
-  optional int32 num_filters = 7;
-}
-
-message BilinearInterpConfig {
-  // The size of input feature map.
-  required ImageConfig image_conf = 1;
-  // The size of output feature map.
-  required uint32 out_size_x = 2;
-  required uint32 out_size_y = 3;
-}
-
-message ImageConfig {
-  // The image data dimensionality.
-  // This value must be either 1, 2, 3, or a multiple of 4.
-  required uint32 channels = 2;
-
-  // The size of input feature map.
-  required uint32 img_size = 8;
-  optional uint32 img_size_y = 9;
-  optional uint32 img_size_z = 10 [ default = 1 ];
-}
-
-message PriorBoxConfig {
-  repeated uint32 min_size = 1;
-  repeated uint32 max_size = 2;
-  repeated float aspect_ratio = 3;
-  repeated float variance = 4;
-}
-
-message PadConfig {
-  required ImageConfig image_conf = 1;
-  repeated uint32 pad_c = 2;
-  repeated uint32 pad_h = 3;
-  repeated uint32 pad_w = 4;
-}
-
-message ReshapeConfig {
-  repeated uint32 height_axis = 1;
-  repeated uint32 width_axis = 2;
-}
-
-message MultiBoxLossConfig {
-  required uint32 num_classes = 1;
-  required float overlap_threshold = 2;
-  required float neg_pos_ratio = 3;
-  required float neg_overlap = 4;
-  required uint32 background_id = 5;
-  required uint32 input_num = 6;
-  optional uint32 height = 7 [ default = 1 ];
-  optional uint32 width = 8 [ default = 1 ];
-}
-
-message DetectionOutputConfig {
-  required uint32 num_classes = 1;
-  required float nms_threshold = 2;
-  required uint32 nms_top_k = 3;
-  required uint32 background_id = 4;
-  required uint32 input_num = 5;
-  required uint32 keep_top_k = 6;
-  required float confidence_threshold = 7;
-  optional uint32 height = 8 [ default = 1 ];
-  optional uint32 width = 9 [ default = 1 ];
-}
-
-message ClipConfig {
-  required double min = 1;
-  required double max = 2;
-}
-
-message UpsampleConfig {
-  required ImageConfig image_conf = 1;
-  optional uint32 scale = 2 [ default = 2 ];
-  optional uint32 scale_y = 3 [ default = 2 ];
-  optional bool pad_out_x = 4 [ default = false ];
-  optional bool pad_out_y = 5 [ default = false ];
-  optional uint32 upsample_size = 6;
-  optional uint32 upsample_size_y = 7;
-}
-
-message ROIPoolConfig {
-  required uint32 pooled_width = 1;
-  required uint32 pooled_height = 2;
-  required float spatial_scale = 3;
-  optional uint32 height = 4 [ default = 1 ];
-  optional uint32 width = 5 [ default = 1 ];
-}
-
-message ScaleSubRegionConfig {
-  required ImageConfig image_conf = 1;
-  required float value = 2;
-}
-
-message LayerInputConfig {
-  required string input_layer_name = 1;
-  optional string input_parameter_name = 2;
-  optional ConvConfig conv_conf = 3;
-  optional PoolConfig pool_conf = 4;
-  optional NormConfig norm_conf = 5;
-  optional ProjectionConfig proj_conf = 6;
-  optional BlockExpandConfig block_expand_conf = 7;
-  optional ImageConfig image_conf = 8;
-  // If the input layer has multi-output.
-  // Set the argument name.
-  optional string input_layer_argument = 9;
-  optional BilinearInterpConfig bilinear_interp_conf = 10;
-  optional MaxOutConfig maxout_conf = 11;
-  optional SppConfig spp_conf = 12;
-  optional PriorBoxConfig priorbox_conf = 13;
-  optional PadConfig pad_conf = 14;
-  optional RowConvConfig row_conv_conf = 15;
-  optional MultiBoxLossConfig multibox_loss_conf = 16;
-  optional DetectionOutputConfig detection_output_conf = 17;
-  optional ClipConfig clip_conf = 18;
-  optional ScaleSubRegionConfig scale_sub_region_conf = 19;
-  optional ROIPoolConfig roi_pool_conf = 20;
-  optional UpsampleConfig upsample_conf = 21;
-}
-
-message LayerConfig {
-  required string name = 1;
-  required string type = 2;
-  optional uint64 size = 3;
-  // optional ActivationConfig activation = 4;
-  optional string active_type = 4;
-  repeated LayerInputConfig inputs = 5;
-  optional string bias_parameter_name = 6;
-
-  // This number must be a multiple of 16.
-  optional uint32 num_filters = 7;
-
-  // indicates that the biases of every filter in this layer
-  // should be shared amongst all applications of that filter
-  // (which is how convnets are usually trained). Setting this to
-  // false will untie the biases, yielding a separate bias for
-  // every location at which the filter is applied.
-  optional bool shared_biases = 8 [ default = false ];
-
-  // Valid values are ones that divide the area of the output
-  // grid in this convolutional layer. For example if this layer
-  // produces 32-channel 20x20 output grid, valid values of
-  // partialSum are ones which divide 20*20 = 400.
-  // I'll update this comments when confirmed
-  optional uint32 partial_sum = 9;
-
-  // for dropout
-  optional double drop_rate = 10;
-
-  // for HierarchicalSoftmaxLayer and NCELayer
-  // the number of classes
-  optional uint32 num_classes = 11;
-
-  // the gpu device which the Layer's data in.
-  // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 12 [ default = -1 ];
-
-  // for recurrent layer. If true, the recurrence runs from the end to the
-  // beginning.
-  optional bool reversed = 13 [ default = false ];
-
-  // for lstmemory layer. Different types of nodes have different activation
-  // type.
-  optional string active_gate_type = 14;
-  optional string active_state_type = 15;
-
-  // For NCELayer
-  // The number of random negative labels for each sample
-  optional int32 num_neg_samples = 16 [ default = 10 ];
-
-  // For NCELayer
-  // The distribution for generating the random negative labels.
-  // A uniform distribution will be used if not provided
-  repeated double neg_sampling_dist = 17 [ packed = true ];
-
-  // For MaxLayer
-  // default: output VALUE of MaxLayer. set this flag to true for output INDEX
-  // INDEX will be put in Argument::value as double values.
-  optional bool output_max_index = 19 [ default = false ];
-
-  /// The filed number 20 have been deprecated.
-
-  // For self-normalized estimation
-  optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ];
-
-  /// The filed numbers 22 and 23 have been deprecated.
-
-  // for MDLstmLayer
-  repeated bool directions = 24;
-
-  // for CTCLayer
-  optional bool norm_by_times = 25;
-
-  // for CostLayers
-  optional double coeff = 26 [ default = 1.0 ];
-
-  // for AverageLayer
-  // can be set to: 'average', 'sum' or 'squarerootn'
-  optional string average_strategy = 27;
-
-  // for error clipping
-  optional double error_clipping_threshold = 28 [ default = 0.0 ];
-
-  // for operators used by mixed layer
-  repeated OperatorConfig operator_confs = 29;
-
-  // for lambdaCost
-  optional int32 NDCG_num = 30;
-  optional int32 max_sort_size = 31;
-
-  // for SlopeInterceptLayer
-  optional double slope = 32;
-  optional double intercept = 33;
-
-  // for CosSimVecMatLayer and CosSimLayer
-  optional double cos_scale = 34;
-
-  // for DataNormLayer
-  // can be set to: 'z-score', 'min-max' or 'decimal-scaling'
-  optional string data_norm_strategy = 36;
-
-  // for bos/eos id
-  optional uint32 bos_id = 37;
-  optional uint32 eos_id = 38;
-
-  // for max id layer
-  optional uint32 beam_size = 39;
-
-  // for seqlastins layer, whether select first instead last
-  optional bool select_first = 40 [ default = false ];
-
-  // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
-  // can be set to: 'non-seq','seq'
-  optional string trans_type = 41 [ default = 'non-seq' ];
-
-  // to indicate whether selective_fc layer
-  // is used in sequence generation or not
-  optional bool selective_fc_pass_generation = 42 [ default = false ];
-
-  // to indicate whether selective_fc layer take its last input to
-  // selected several columns and only compute the multiplications
-  // between the input matrices and the selected columns of
-  // the parameter matrices of this layer.
-  // if set false, selective_fc degrades into fc.
-  optional bool has_selected_colums = 43 [ default = true ];
-
-  // this parameter is for speed consideration.
-  // if number of the selected columns is less than
-  // sample number * selective_fc output size * selective_fc_mull_mull_ratio
-  // sparse multiplication is used, otherwise, using full multiplication.
-  optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ];
-
-  // to indicate how many threads selective_fc use to to accelate
-  // the plain_mul period
-  // leave empty or set to 0 to disable multi-thread accleleration
-  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45
-      [ default = 0 ];
-
-  // for batch normalization layer
-  // if set use_global_stats true, will use the loaded mean and variance.
-  optional bool use_global_stats = 46;
-
-  // use to compute moving mean and variance.
-  optional double moving_average_fraction = 47 [ default = 0.9 ];
-
-  // bias size
-  optional uint32 bias_size = 48 [ default = 0 ];
-
-  // this parameter can be used as a user-defined parameter when necessary,
-  // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented,
-  // it can be used to pass that parameter, without modifying the proto file.
-  // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.
-  optional string user_arg = 49;
-
-  // to indicate rectangle image data
-  optional uint64 height = 50;
-  optional uint64 width = 51;
-
-  // blank label used in ctc loss
-  optional uint32 blank = 52 [ default = 0 ];
-
-  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
-  // controls the scope of pooling operation. can be set > 0.
-  // leave empty or set to -1 to disable this stride pooling.
-  optional int32 seq_pool_stride = 53 [ default = -1 ];
-
-  // for crop layer
-  optional int32 axis = 54 [ default = 2 ];
-  repeated uint32 offset = 55;
-  repeated uint32 shape = 56;
-
-  // for HuberRegressionLoss
-  optional double delta = 57 [ default = 1.0 ];
-
-  // for 3D data
-  optional uint64 depth = 58 [ default = 1 ];
-
-  // for switch order layer
-  optional ReshapeConfig reshape_conf = 59;
-
-  // for batch normalization layer
-  // The small constant added to the variance to improve numeric stability.
-  optional double epsilon = 60 [ default = 0.00001 ];
-
-  // for factorization machine layer
-  optional uint32 factor_size = 61;
-}
-
-message EvaluatorConfig {
-  required string name = 1;
-  required string type = 2;
-  repeated string input_layers = 3;
-
-  // Used by ChunkEvaluator
-  // one of "IOB", "IOE", "IOBES"
-  optional string chunk_scheme = 4;
-  // number of chunk types other than "other"
-  optional int32 num_chunk_types = 5;
-
-  // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
-  // For multi binary labels: true if output > classification_threshold
-  optional double classification_threshold = 6 [ default = 0.5 ];
-  // The positive label. -1 means average precision and recall
-  optional int32 positive_label = 7 [ default = -1 ];
-
-  // load dict from this file
-  optional string dict_file = 8;
-
-  // dump result in this file
-  optional string result_file = 9;
-
-  // top # results for max id printer
-  optional int32 num_results = 10 [ default = 1 ];
-
-  // whether to delimit the sequence in the seq_text_printer
-  optional bool delimited = 11 [ default = true ];
-
-  // Used by ChunkEvaluator
-  // chunk of these types are not counted
-  repeated int32 excluded_chunk_types = 12;
-
-  // Used by ClassificationErrorEvaluator
-  // top # classification error
-  optional int32 top_k = 13 [ default = 1 ];
-
-  // Used by DetectionMAPEvaluator
-  optional double overlap_threshold = 14 [ default = 0.5 ];
-
-  optional int32 background_id = 15 [ default = 0 ];
-
-  optional bool evaluate_difficult = 16 [ default = false ];
-
-  optional string ap_type = 17 [ default = "11point" ];
-}
-
-message LinkConfig {
-  required string layer_name = 1;
-  required string link_name = 2;
-  // If true, this link has sub-sequence
-  optional bool has_subseq = 3 [ default = false ];
-}
-
-message MemoryConfig {
-  required string layer_name = 1;
-  required string link_name = 2;
-
-  optional string boot_layer_name = 3;
-  optional string boot_bias_parameter_name = 4;
-  optional string boot_bias_active_type = 5;
-  optional uint32 boot_with_const_id = 7;
-
-  // memory is a sequence, initailized by a sequence boot layer
-  optional bool is_sequence = 6 [ default = false ];
-}
-
-message GeneratorConfig {
-  required uint32 max_num_frames = 1;
-  required string eos_layer_name = 2;
-  optional int32 num_results_per_sample = 3 [ default = 1 ];
-
-  // for beam search
-  optional int32 beam_size = 4 [ default = 1 ];
-
-  optional bool log_prob = 5 [ default = true ];
-}
-
-message SubModelConfig {
-  required string name = 1;
-  repeated string layer_names = 2; // selected layers in sub model
-  repeated string input_layer_names = 3;
-  repeated string output_layer_names = 4;
-  repeated string evaluator_names = 5;
-
-  optional bool is_recurrent_layer_group = 6 [ default = false ];
-
-  // If true, the recurrence runs from the end to the beginning.
-  optional bool reversed = 7 [ default = false ];
-
-  // name and link name of memory
-  repeated MemoryConfig memories = 8;
-
-  // if use recurrent layer group, all layers in submodel will postfix by
-  // "_in_"+submodel.name, so we add a name pair to link between
-  // root model and layer group,
-  // note that these in/out layers are not input/output of the network.
-  repeated LinkConfig in_links = 9;
-  repeated LinkConfig out_links = 10;
-
-  optional GeneratorConfig generator = 11;
-
-  // the id of inlink which share info with outlinks, used in recurrent layer
-  // group
-  optional int32 target_inlinkid = 12;
-}
-
-message ModelConfig {
-  // type of the model.
-  // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
-  required string type = 1 [ default = "nn" ];
-
-  // layers should be ordered in such a way that the forward propagation
-  // can be correctly executed by going from the first layer to the last layer
-  repeated LayerConfig layers = 2;
-
-  repeated ParameterConfig parameters = 3;
-
-  // Input layers should have the same order as the data streams provided
-  // by the data provider. The type of input layers should be "data"
-  repeated string input_layer_names = 4;
-
-  // For training, the type of a output layer is usually cost layer.
-  // For prediction, they should be the actual output layers.
-  repeated string output_layer_names = 5;
-
-  repeated EvaluatorConfig evaluators = 6;
-
-  repeated SubModelConfig sub_models = 8;
-
-  // For External Machine, defining how to split a neural network
-  // into multiple parts.
-  optional ExternalConfig external_config = 9;
-};
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
deleted file mode 100644
index e9ea1bfbc..000000000
--- a/proto/OptimizerConfig.proto
+++ /dev/null
@@ -1,164 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-syntax = "proto2";
-
-option optimize_for = LITE_RUNTIME;
-
-package paddle;
-
-message SGDConfig {
-  // SGD
-  // momentum: float >= 0. Parameter updates momentum.
-  // decay: float >= 0. Learning rate decay over each update.
-  // nesterov: boolean. Whether to apply Nesterov momentum.
-  optional double momentum = 21 [ default = 0.0 ];
-  optional double decay = 23 [ default = 0.0 ];
-  optional bool nesterov = 24 [ default = false ];
-}
-
-message AdadeltaConfig {
-  // Adadelta
-  // It is recommended to leave it at the default value.
-  // rho: float >= 0.
-  // epsilon: float >= 0. Fuzz factor.
-  // decay: float >= 0. Learning rate decay over each update.
-
-  // reference : [Adadelta - an adaptive learning rate
-  // method](http://arxiv.org/abs/1212.5701)
-  optional double rho = 33 [ default = 0.90 ];
-  optional double epsilon = 31 [ default = 1e-5 ];
-  optional double decay = 32 [ default = 0.0 ];
-}
-
-message AdagradConfig {
-  // Adagrad
-  // epsilon: float >= 0.
-  // decay: float >= 0. Learning rate decay over each update.
-
-  // reference : [Adaptive Subgradient Methods for Online Learning and
-  // Stochastic
-  // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
-  optional double epsilon = 41 [ default = 1e-5 ];
-  optional double decay = 42 [ default = 0.0 ];
-}
-
-message AdamConfig {
-  // Adaj
-  // beta_1: float, 0 < beta < 1. Generally close to 1.
-  // beta_2: float, 0 < beta < 1. Generally close to 1.
-  // epsilon: float >= 0. Fuzz factor.
-  // decay: float >= 0. Learning rate decay over each update.
-  // reference : [Adam - A Method for Stochastic
-  // Optimization](http://arxiv.org/abs/1412.6980v8)
-  optional double beta_1 = 41;
-  optional double beta_2 = 42;
-  optional double epsilon = 43;
-  optional double decay = 44;
-}
-
-message ConstLrConfig {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-}
-
-message LinearLrConfig {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-  optional double lr_decay_a = 2;
-  optional double lr_decay_b = 3;
-}
-
-message TensorProto {
-  enum DataType {
-    PADDLE_ELEMENT_TYPE_INT32 = 0;
-    PADDLE_ELEMENT_TYPE_UINT32 = 1;
-    PADDLE_ELEMENT_TYPE_INT64 = 2;
-    PADDLE_ELEMENT_TYPE_UINT64 = 3;
-    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
-    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
-  }
-  optional DataType data_type = 1;
-  repeated bytes content = 2;
-}
-
-message LrPolicyState {
-  // learninRate Policy
-  optional double learning_rate = 1 [ default = 1.0 ];
-  optional double lr_decay_a = 2;
-  optional double lr_decay_b = 3;
-}
-
-message SGDOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto momentums = 2;
-}
-
-message AdadeltaOptimizerState {
-  // learning rate policy
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto accum_gradient = 2;
-  optional TensorProto accum_delta = 3;
-  optional TensorProto update_delta = 4;
-}
-
-message AdagradOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto accum_gradient = 2;
-}
-
-message AdamOptimizerState {
-  optional LrPolicyState lr_state = 101;
-  optional double num_sample_passed = 104;
-  // state
-  optional TensorProto parameter = 1;
-  optional TensorProto momentums = 2;
-  optional TensorProto velocitys = 3;
-}
-
-message OptimizerConfig {
-  enum Optimizer {
-    SGD = 1;
-    Adadelta = 2;
-    Adagrad = 3;
-    Adam = 4;
-  }
-  optional Optimizer optimizer = 1;
-  optional SGDConfig sgd = 3;
-  optional AdadeltaConfig adadelta = 4;
-  optional AdagradConfig adagrad = 5;
-  optional AdamConfig adam = 6;
-
-  enum LrPolicy {
-    Const = 0;
-    Linear = 1;
-  }
-  optional LrPolicy lr_policy = 11;
-  optional ConstLrConfig const_lr = 12;
-  optional LinearLrConfig linear_lr = 13;
-
-  // common config of optimizer
-  // gradient clip when L2 exceeding value
-  optional double clip_norm = 101;
-  // gradient clip when L1 exceeding value
-  optional double clip_value = 102;
-}
diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto
deleted file mode 100644
index 6f8ba9d76..000000000
--- a/proto/ParameterConfig.proto
+++ /dev/null
@@ -1,83 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/**
- * Configuration structure for parameter
- */
-
-enum ParameterInitStrategy {
-  PARAMETER_INIT_NORMAL = 0;
-  PARAMETER_INIT_UNIFORM = 1;
-}
-
-message ParameterUpdaterHookConfig {
-  // hook type such as  'pruning'
-  required string type = 1;
-  // this represents the ratio of zero element to be set by the Parameter
-  optional double sparsity_ratio = 2 [ default = 0.6 ];
-}
-
-message ParameterConfig {
-  required string name = 1;
-  required uint64 size = 2;
-  optional double learning_rate = 3 [ default = 1.0 ];
-  optional double momentum = 4 [ default = 0.0 ];
-  optional double initial_mean = 5 [ default = 0.0 ];
-  optional double initial_std = 6 [ default = 0.01 ];
-  // use L2-regularization if decay_rate set and decay_rate_l1 not set
-  optional double decay_rate = 7 [ default = 0.0 ];
-  // use L1-regularization if decay_rate_l1 set
-  optional double decay_rate_l1 = 8 [ default = 0.0 ];
-  // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
-  repeated uint64 dims = 9;
-  // the gpu device which the parameter in.
-  // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 10 [ default = -1 ];
-  // how to init the parameter: 0 -> normal, 1 -> uniform
-  // 0: treat initial_mean as mean, intial_std as standard deviation
-  // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
-  optional int32 initial_strategy = 11 [ default = 0 ];
-  // define the variance when init the parameter, by height of the Matrix
-  optional bool initial_smart = 12 [ default = false ];
-  // apply regularization every # batches
-  optional int32 num_batches_regularization = 13 [ default = 1 ];
-  // if is_sparse is true, para is sparse, else para is dense
-  optional bool is_sparse = 14 [ default = false ];
-  // if para is sparse, format should be "csc" or "csr", empty means is not
-  // sparse
-  optional string format = 15 [ default = "" ];
-  // sparse remote update or not
-  optional bool sparse_remote_update = 16 [ default = false ];
-  // gradient clipping threshold, no clipping by default
-  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
-  // static parameters are fixed when training
-  optional bool is_static = 18 [ default = false ];
-  // para_id should NOT be set by config_parser. It is for
-  // internal use.
-  optional uint64 para_id = 19;
-
-  repeated ParameterUpdaterHookConfig update_hooks = 20;
-  // setup load mat -> csr
-  optional bool need_compact = 21 [ default = false ];
-  // whether to do sparse update for this parameter
-  optional bool sparse_update = 22 [ default = false ];
-
-  // whether this parameter is shared or not.
-  optional bool is_shared = 23 [ default = false ];
-  // parameter block size
-  optional uint64 parameter_block_size = 24 [ default = 0 ];
-}
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
deleted file mode 100644
index 1404c8aa1..000000000
--- a/proto/ParameterServerConfig.proto
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-package paddle;
-
-/**
- * Configuration structure for ParameterClient2.
- */
-message ParameterClientConfig { required int32 trainer_id = 1; }
-
-/**
- * Configuration structure for ParameterServer2.
- */
-message ParameterServerConfig {
-  // Number of ports for sending dense parameter,
-  // following ports on parameter server will be visited
-  // for sending dense parameter: [port, port+ports_num-1]
-  required int32 ports_num = 1 [ default = 1 ];
-  // Number of ports for sending sparse parameter,
-  // following ports on parameter server will be visited
-  // for sending sparse parameter:
-  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
-  required int32 ports_num_for_sparse = 2 [ default = 0 ];
-  // network device name for pservers
-  required string nics = 3 [ default = "xgbe0,xgbe1" ];
-  required string rdma_tcp = 4 [ default = "tcp" ];
-  // Listening port for pserver
-  required int32 port = 5 [ default = 20134 ];
-  // number of gradient servers
-  required int32 num_gradient_servers = 6 [ default = 1 ];
-  // number of threads for sync op exec
-  required int32 pserver_num_threads = 7 [ default = 1 ];
-  // control config_.async_lagged_grad_discard_ratio() min value
-  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
-  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
-  // use it as defalut value
-  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
-}
diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto
deleted file mode 100644
index b56c1bfe7..000000000
--- a/proto/ParameterService.proto
+++ /dev/null
@@ -1,351 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "ParameterConfig.proto";
-import "TrainerConfig.proto";
-
-package paddle;
-
-/**
- * Various structs for communicating with parameter server
- */
-enum ParameterUpdateMode {
-  // Set parameter
-  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
-  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
-
-  // Update parameter once a gradient is received
-  PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
-
-  // Accumulate gradient
-  PSERVER_UPDATE_MODE_ADD_GRADIENT = 3;
-
-  // Average parameters
-  PSERVER_UPDATE_MODE_AVERAGE_PARAMETER = 4;
-
-  // No update. Only get parameters back.
-  PSERVER_UPDATE_MODE_GET_PARAM = 5;
-  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
-};
-
-message ParameterBlock {
-  // it accurately means parameter id.
-  required uint64 para_id = 1;
-  // global sparse row or dense block for each block in parameter
-  required uint64 block_id = 2;
-  // offset in (local) storage
-  required uint64 begin_pos = 3;
-  // actual size of block, size for last block is [endDim -beginDim],
-  // others is parameter_block_size in ParameterConfig
-  required uint64 block_size = 4;
-}
-
-enum PServerStatus {
-  PSERVER_STATUS_NOT_SET = 0;
-  PSERVER_STATUS_PARAMETER_READY = 1;
-};
-
-enum BatchStatus {
-  BATCH_START = 0;
-  BATCH_ON = 1;
-  BATCH_FINISH = 2;
-  BATCH_START_AND_FINISH = 3;
-};
-
-message SendParameterRequest {
-  required ParameterUpdateMode update_mode = 1;
-  repeated ParameterBlock blocks = 2;
-  required bool send_back_parameter = 3;
-
-  // number of samples used for calculating this update
-  optional int64 num_samples = 4;
-
-  // cost will be used to calculate global objective value
-  optional double cost = 5;
-
-  required BatchStatus batch_status = 6;
-
-  optional int32 trainer_id = 7;
-
-  // send back parameter type on pserver, PARAMETER_VALUE by default
-  optional int32 send_back_parameter_type = 8 [ default = 0 ];
-
-  // forwardbackward time in usec
-  optional uint64 forwardbackward_time = 9;
-}
-
-message WaitPassStartRequest {}
-
-message WaitPassStartResponse {}
-
-message WaitPassFinishRequest {}
-
-message WaitPassFinishResponse {}
-
-enum SyncObject {
-  SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
-  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
-}
-
-message SynchronizeRequest {
-  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
-
-  optional int32 trainer_id = 2;
-}
-
-message SynchronizeResponse {}
-
-message SendParameterResponse { repeated ParameterBlock blocks = 1; }
-
-message SetConfigRequest {
-  repeated ParameterConfig param_configs = 1;
-  required OptimizationConfig opt_config = 2;
-  required string save_dir = 4;
-  required int32 server_id = 5;
-  required bool is_sparse_server = 6;
-}
-
-message SetConfigResponse {}
-
-message GetStatusRequest {}
-
-message GetStatusResponse { required PServerStatus status = 1; }
-
-message SetStatusRequest { required PServerStatus status = 1; }
-
-message SetStatusResponse {}
-
-// create a column vector. The size is the dimension of parameter
-message CreateVectorRequest {}
-
-message CreateVectorResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  required int64 handle = 2;
-}
-
-message ReleaseVectorRequest { required int64 handle = 1; }
-
-message ReleaseVectorResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-// Create a column major matrix. The number of rows is the dimension
-// of parameter. The number of columns is specifed by num_cols
-message CreateMatrixRequest { required int32 num_cols = 1; }
-
-message CreateMatrixResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  required int64 handle = 2;
-}
-
-message ReleaseMatrixRequest { required int64 handle = 1; }
-
-message ReleaseMatrixResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-/**
- * The operations are defined using the variables commented at Operation
- * and OperationResult
- */
-enum MatrixVectorOperation {
-  // r = u^T u
-  PSERVER_OP_utu = 0;
-
-  // r = u^T v
-  PSERVER_OP_utv = 1;
-
-  // u = a u
-  PSERVER_OP_au = 2;
-
-  // v = a u + b v
-  PSERVER_OP_au_bv = 3;
-
-  // u = a A x + b u
-  PSERVER_OP_aAx_bu = 4;
-
-  // Stochastic gradient update
-  PSERVER_OP_SGD = 5;
-
-  // u = a
-  PSERVER_OP_RESET = 6;
-
-  // v = u
-  PSERVER_OP_COPY = 7;
-
-  // w = a u + b v + c w
-  PSERVER_OP_au_bv_cw = 8;
-
-  // owlqn: MakeSteepestDescDir
-  PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
-
-  // owlqn: FixDirSigns
-  PSERVER_OP_FIX_DIR_SIGNS = 10;
-
-  // owlqn: DirDeriv
-  PSERVER_OP_DIR_DERIV = 11;
-
-  // owlqn: FixOmegaSigns
-  PSERVER_OP_FIX_OMEGA_SIGNS = 12;
-
-  // Get overall cost
-  PSERVER_OP_COST = 13;
-
-  // Pass control
-  PSERVER_OP_START_PASS = 14;
-  PSERVER_OP_FINISH_PASS = 15;
-
-  // randomize value
-  PSERVER_OP_RANDOMIZE = 16;
-
-  // call optimizer apply
-  PSERVER_OP_APPLY = 17;
-}
-
-message ProtoVector {
-  required int64 dim = 1;
-  repeated double values = 2 [ packed = true ];
-}
-
-message ProtoMatrix {
-  required int64 num_rows = 1;
-  required int64 num_cols = 2;
-  repeated double values = 3 [ packed = true ];
-}
-
-message Operation {
-  required MatrixVectorOperation operation = 1;
-
-  // vector handles created on the pserver
-  repeated int64 pvectors = 2; // u, v, w
-
-  // matrix handles created on the pserver
-  repeated int64 pmatrices = 3; // A, B, C
-
-  repeated double scalars = 4;       // a, b, c
-  repeated ProtoVector vectors = 5;  // x, y, z
-  repeated ProtoMatrix matrices = 6; // X, Y, Z
-}
-
-message OperationResult {
-  // error message. Empty if success
-  optional string return_message = 1;
-  //
-  repeated double scalars = 2;       // d, e, f
-  repeated ProtoVector vectors = 3;  // p, q, r
-  repeated ProtoMatrix matrices = 4; // P, Q, R
-}
-
-message DoOperationRequest {
-  repeated Operation operations = 1;
-
-  // If true, wait for gradient to be ready before starting the operations
-  required bool wait_for_gradient = 2;
-
-  // If true, send back the parameter to clients after the operations are
-  // finished
-  required bool send_back_parameter = 3;
-
-  // If true, and if all clients call waitPassFinish,
-  // signal all clients finish the pass
-  required bool release_pass = 4;
-}
-
-message DoOperationResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-
-  repeated OperationResult results = 2;
-
-  required bool pass_finish = 3;
-}
-
-message LoadValueRequest { required string dir_name = 1; }
-
-message LoadValueResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-message SaveValueRequest { required string dir_name = 1; }
-
-message SaveValueResponse {
-  // error message. Empty if success
-  optional string return_message = 1;
-}
-
-enum DataUpdateMode {
-  // Client send it's own data to pserver
-  DATA_UPDATE_MODE_SET_OWN = 0;
-  // Client get all user data from all pservers
-  DATA_UPDATE_MODE_GET_ALL = 1;
-  // Client send it's own ref feature to pserver
-  DATA_UPDATE_MODE_SET_REF = 2;
-  // Client get all ref featuers from all pservers
-  DATA_UPDATE_MODE_GET_REF = 3;
-  // Client send it's own ref label to pserver
-  DATA_UPDATE_MODE_SET_REF_LABEL = 4;
-  // Client get all ref labels from all pservers
-  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
-  // Client send it's own ref grad to pserver
-  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
-  // Client get all ref grad from all pservers
-  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
-}
-
-enum SendDataType {
-  DATA_REF = 0;
-  DATA_REFLABEL = 1;
-  DATA_REFGRAD = 2;
-  DATA_REDUCE_SUM = 3;
-}
-
-enum TransDataType {
-  TRANS_INT32 = 0;
-  TRANS_UINT32_T = 1;
-  TRANS_INT64_T = 2;
-  TRANS_UINT64_T = 3;
-  TRANS_FLOAT = 5;
-  TRANS_DOUBLE = 6;
-}
-
-message DataBlock {
-  // total byte size of this data blcok
-  required uint64 total_size = 1;
-  // byte size of one data type
-  required int32 data_size = 2;
-  // data_type
-  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
-}
-
-message SendDataRequest {
-  required SendDataType type = 1;
-  required DataUpdateMode update_mode = 2;
-  repeated DataBlock blocks = 3;
-  required uint64 client_id = 4;
-  required uint64 server_id = 5;
-}
-
-message SendDataResponse {
-  required SendDataType type = 1;
-  repeated DataBlock blocks = 2;
-  required uint64 server_id = 3;
-}
diff --git a/proto/README.md b/proto/README.md
deleted file mode 100644
index dda7ed7b3..000000000
--- a/proto/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-## protos in this folder are legacy v2 protos.
-
-## Please refer to paddle/fluid for latest version.
diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto
deleted file mode 100644
index 9cc20b4a3..000000000
--- a/proto/TrainerConfig.proto
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-syntax = "proto2";
-
-import "DataConfig.proto";
-import "ModelConfig.proto";
-
-package paddle;
-
-message OptimizationConfig {
-  optional int32 batch_size = 3 [ default = 1 ];
-  required string algorithm = 4 [ default = "async_sgd" ];
-  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
-  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];
-
-  required double learning_rate = 7;
-  optional double learning_rate_decay_a = 8 [ default = 0 ];
-  optional double learning_rate_decay_b = 9 [ default = 0 ];
-  optional string learning_rate_schedule = 27 [ default = "constant" ];
-  // learning rate will be scaled according to learning_rate_schedule
-  // 1), constant:
-  // lr = learning_rate
-  // 2), poly:
-  // lr = learning_rate *
-  //      pow(1 + learning_rate_decay_a * num_samples_processed,
-  //          -learning_rate_decay_b)
-  // 3), exp:
-  // lr = learning_rate *
-  //      pow(learning_rate_decay_a,
-  //          num_samples_processed / learning_rate_decay_b)
-  // 4), discexp:
-  // lr = learning_rate *
-  //      pow(learning_rate_decay_a,
-  //          floor(num_samples_processed / learning_rate_decay_b))
-  // 5), linear:
-  // lr = max(learning_rate - learning_rate_decay_a * num_samples_processed,
-  //          learning_rate_decay_b)
-
-  // owlqn related
-  // L1-regularization
-  optional double l1weight = 10 [ default = 0.1 ];
-  // L2-regularization
-  optional double l2weight = 11 [ default = 0 ];
-  // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
-  // then accept the step
-  optional double c1 = 12 [ default = 0.0001 ];
-  // multiply the step with "backoff", when wolfe condition doesn't satisfy
-  optional double backoff = 13 [ default = 0.5 ];
-  // how many "s"s and "y"s are kept in owlqn
-  optional int32 owlqn_steps = 14 [ default = 10 ];
-  // accept the step if encountered "max_backoff" times of "reduce the step"
-  optional int32 max_backoff = 15 [ default = 5 ];
-  // L2-regularization coefficient is reduced linearly from iteration 0 to
-  // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
-  // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
-  optional int32 l2weight_zero_iter = 17 [ default = 0 ];
-
-  // averaged sgd
-  // About average_window * numBatchProcessed parameter are used
-  // for average. To be accurate, between average_window * numBatchProcessed
-  // and 2 * average_window * numBatchProcessed parameters are used for
-  // average.
-  optional double average_window = 18 [ default = 0 ];
-  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];
-
-  //////////////////////////
-  // Options Adaptive SGD //
-  //////////////////////////
-
-  // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
-  // "rmsprop"
-  // default learning method("momentum") use global decayed learning rate with
-  // momentum.
-  // "adagrad", "adadelta" and "rmsprop" can set momentum too.
-  optional string learning_method = 23 [ default = "momentum" ];
-  optional double ada_epsilon = 24 [ default = 1e-6 ];
-  optional double ada_rou = 26 [ default = 0.95 ];
-
-  // Force to do average in cpu in order to save gpu memory usage
-  optional bool do_average_in_cpu = 25 [ default = false ];
-
-  // delta add rate in pserver, used while num_batches_per_send_parameter>1
-  // will be divided by #machines automatically.
-  optional double delta_add_rate = 28 [ default = 1.0 ];
-
-  // We split a large size into smaller mini-batches, whose sizes are
-  // determined by mini_batch_size. It only takes effect when there is
-  // an ExternalMachine.
-  optional int32 mini_batch_size = 29 [ default = 128 ];
-
-  // automatically set if any one of parameters set sparse remote update flag
-  optional bool use_sparse_remote_updater = 30 [ default = false ];
-
-  // how to update center parameter and feedback to local parameter,
-  // when use local sgd update in cluster training.
-  // A option is elastic_average, proposed by the paper: Deep learning with
-  // elastic averaging SGD.
-  // If use elastic_average method, every trainer node should sample from whole
-  // data sets.
-  optional string center_parameter_update_method = 31 [ default = "average" ];
-
-  // shrink sparse parameter value
-  // only works if parameter is remote sparse update and has L1 decay rate
-  optional double shrink_parameter_value = 32 [ default = 0 ];
-
-  ////////////////////////////
-  // Options Adam Optimizer //
-  ////////////////////////////
-  optional double adam_beta1 = 33 [ default = 0.9 ];
-  optional double adam_beta2 = 34 [ default = 0.999 ];
-  optional double adam_epsilon = 35 [ default = 1e-8 ];
-
-  // arguments for learning rate scheduler
-  // Format: num1:rate1,num2:rate2,...,numK:rateK
-  // For learning_rate_schedule="manual", num is the number of samples,
-  // For learning_rate_schedule="pass_manual",
-  //  num is the number of passes (starting from 0)
-  optional string learning_rate_args = 36 [ default = "" ];
-
-  // for async sgd gradient commit control.
-  // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
-  // current async gradient will be discard silently.
-  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];
-
-  // global threshold for gradient clipping
-  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
-};
-
-message TrainerConfig {
-  optional ModelConfig model_config = 1;
-  optional DataConfig data_config = 2;
-  required OptimizationConfig opt_config = 3;
-  optional DataConfig test_data_config = 4;
-  repeated string config_files = 5;
-
-  // the directory to save/load model files for each training path
-  optional string save_dir = 6 [ default = "./output/model" ];
-
-  // Path of the initial model parameters.
-  // If it was set, start_pass will be ignored.
-  optional string init_model_path = 7;
-
-  // Start training from this pass.
-  // Will load parameter from the previous pass.
-  optional int32 start_pass = 8 [ default = 0 ];
-
-  // file path to the trainer config file
-  optional string config_file = 9;
-}
-- 
GitLab


From eec133ca6a9545e5a05bfa7b8eced8a6a69582c4 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:48:41 +0800
Subject: [PATCH 4/9] remove legacy testing code

---
 paddle/testing/TestMain.cpp |  22 ----
 paddle/testing/TestUtil.cpp | 222 ------------------------------------
 paddle/testing/TestUtil.h   |  78 -------------
 3 files changed, 322 deletions(-)
 delete mode 100644 paddle/testing/TestMain.cpp
 delete mode 100644 paddle/testing/TestUtil.cpp
 delete mode 100644 paddle/testing/TestUtil.h

diff --git a/paddle/testing/TestMain.cpp b/paddle/testing/TestMain.cpp
deleted file mode 100644
index 1811dbbd1..000000000
--- a/paddle/testing/TestMain.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/utils/Util.h"
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
deleted file mode 100644
index fa8efc20f..000000000
--- a/paddle/testing/TestUtil.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TestUtil.h"
-#include <gflags/gflags.h>
-#include "paddle/legacy/math/SparseMatrix.h"
-
-DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
-
-namespace paddle {
-
-std::string randStr(const int len) {
-  std::string str =
-      "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-  std::string s = "";
-  for (int i = 0; i < len; ++i) s += str[(rand() % 62)];  // NOLINT
-  return s;
-}
-
-MatrixPtr makeRandomSparseMatrix(size_t height,
-                                 size_t width,
-                                 bool withValue,
-                                 bool useGpu,
-                                 bool equalNnzPerSample) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  std::vector<int64_t> ids(height);
-  std::vector<int64_t> indices(height + 1);
-  indices[0] = 0;
-
-  std::function<size_t()> randomer = [] { return uniformRandom(10); };
-  if (equalNnzPerSample) {
-    size_t n = 0;
-    do {
-      n = uniformRandom(10);
-    } while (!n);
-    randomer = [=] { return n; };
-  }
-  for (size_t i = 0; i < height; ++i) {
-    indices[i + 1] = indices[i] + std::min(randomer(), width);
-    ids[i] = i;
-  }
-
-  if (!withValue) {
-    std::vector<sparse_non_value_t> data;
-    data.resize(indices[height] - indices[0]);
-    for (size_t i = 0; i < data.size(); ++i) {
-      data[i].col = uniformRandom(width);
-    }
-    auto mat = Matrix::createSparseMatrix(
-        height, width, data.size(), NO_VALUE, SPARSE_CSR, false, useGpu);
-    if (useGpu) {
-      std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
-    } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data());
-    }
-    return mat;
-  } else {
-    std::vector<sparse_float_value_t> data;
-    data.resize(indices[height] - indices[0]);
-    for (size_t i = 0; i < data.size(); ++i) {
-      data[i].col = uniformRandom(width);
-      data[i].value = rand() / static_cast<float>(RAND_MAX);  // NOLINT
-    }
-    auto mat = Matrix::createSparseMatrix(
-        height, width, data.size(), FLOAT_VALUE, SPARSE_CSR, false, useGpu);
-    if (useGpu) {
-      std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
-    } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-          ids.data(), indices.data(), data.data());
-    }
-    return mat;
-  }
-#endif
-  return nullptr;
-}
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    IVectorPtr& sequenceStartPositions) {
-  ICpuGpuVectorPtr gpuCpuVec;
-  generateSequenceStartPositions(batchSize, gpuCpuVec);
-  sequenceStartPositions = gpuCpuVec->getMutableVector(false);
-}
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    ICpuGpuVectorPtr& sequenceStartPositions) {
-  int numSeqs;
-  if (FLAGS_fixed_seq_length != 0) {
-    numSeqs = std::ceil((float)batchSize / (float)FLAGS_fixed_seq_length);
-  } else {
-    numSeqs = batchSize / 10 + 1;
-  }
-  sequenceStartPositions =
-      ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-  int* buf = sequenceStartPositions->getMutableData(false);
-  int64_t pos = 0;
-  int len = FLAGS_fixed_seq_length;
-  int maxLen = 2 * batchSize / numSeqs;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (FLAGS_fixed_seq_length == 0) {
-      len = uniformRandom(
-                std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
-            1;
-    }
-    buf[i] = pos;
-    pos += len;
-    VLOG(1) << " len=" << len;
-  }
-  buf[numSeqs] = batchSize;
-}
-
-void generateSubSequenceStartPositions(
-    const ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  const int* buf = sequenceStartPositions->getData(false);
-  int numOnes = 0;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (buf[i + 1] - buf[i] == 1) {
-      ++numOnes;
-    }
-  }
-  // each seq has two sub-seq except length 1
-  int numSubSeqs = numSeqs * 2 - numOnes;
-  subSequenceStartPositions =
-      ICpuGpuVector::create(numSubSeqs + 1, /* useGpu= */ false);
-  int* subBuf = subSequenceStartPositions->getMutableData(false);
-  int j = 0;
-  for (int i = 0; i < numSeqs; ++i) {
-    if (buf[i + 1] - buf[i] == 1) {
-      subBuf[j++] = buf[i];
-    } else {
-      int len = uniformRandom(buf[i + 1] - buf[i] - 1) + 1;
-      subBuf[j++] = buf[i];
-      subBuf[j++] = buf[i] + len;
-    }
-  }
-  subBuf[j] = buf[numSeqs];
-}
-
-void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims) {
-  /* generate sequences with 2 dims */
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  int numDims = 2;
-
-  cpuSequenceDims = IVector::create(numSeqs * numDims, /* useGpu= */ false);
-  int* bufStarts = sequenceStartPositions->getData();
-  int* bufDims = cpuSequenceDims->getData();
-
-  for (int i = 0; i < numSeqs; i++) {
-    int len = bufStarts[i + 1] - bufStarts[i];
-    /* get width and height randomly */
-    std::vector<int> dimVec;
-    for (int j = 0; j < len; j++) {
-      if (len % (j + 1) == 0) {
-        dimVec.push_back(1);
-      }
-    }
-    int idx = rand() % dimVec.size();  // NOLINT use rand_r
-    bufDims[i * numDims] = dimVec[idx];
-    bufDims[i * numDims + 1] = len / dimVec[idx];
-  }
-}
-
-void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims) {
-  /* generate sequences with 2 dims */
-  int numSeqs = sequenceStartPositions->getSize() - 1;
-  int numDims = 2;
-
-  cpuSequenceDims = IVector::create(numSeqs * numDims, /* useGpu= */ false);
-  const int* bufStarts = sequenceStartPositions->getData(false);
-  int* bufDims = cpuSequenceDims->getData();
-
-  for (int i = 0; i < numSeqs; i++) {
-    int len = bufStarts[i + 1] - bufStarts[i];
-    /* get width and height randomly */
-    std::vector<int> dimVec;
-    for (int j = 0; j < len; j++) {
-      if (len % (j + 1) == 0) {
-        dimVec.push_back(1);
-      }
-    }
-    int idx = rand() % dimVec.size();  // NOLINT use rand_r
-    bufDims[i * numDims] = dimVec[idx];
-    bufDims[i * numDims + 1] = len / dimVec[idx];
-  }
-}
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
-  EXPECT_EQ(a->getWidth(), b->getWidth());
-  EXPECT_EQ(a->getHeight(), b->getHeight());
-  EXPECT_EQ(a->isTransposed(), b->isTransposed());
-  for (size_t r = 0; r < a->getHeight(); ++r) {
-    for (size_t c = 0; c < a->getWidth(); ++c) {
-      EXPECT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
-    }
-  }
-}
-
-void checkVectorEqual(const IVectorPtr& a, const IVectorPtr& b) {
-  EXPECT_EQ(a->getSize(), b->getSize());
-  for (size_t r = 0; r < a->getSize(); ++r) {
-    EXPECT_FLOAT_EQ(a->get(r), b->get(r));
-  }
-}
-}  // namespace paddle
diff --git a/paddle/testing/TestUtil.h b/paddle/testing/TestUtil.h
deleted file mode 100644
index 98b864e3c..000000000
--- a/paddle/testing/TestUtil.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <gtest/gtest.h>
-#include "paddle/legacy/math/Matrix.h"
-
-namespace paddle {
-
-std::string randStr(const int len);
-
-inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
-
-inline bool approximatelyEqual(float a, float b, float epsilon) {
-  return fabs(a - b) <= ((fabs(a) < fabs(b) ? fabs(b) : fabs(a)) * epsilon);
-}
-
-MatrixPtr makeRandomSparseMatrix(size_t height,
-                                 size_t width,
-                                 bool withValue,
-                                 bool useGpu,
-                                 bool equalNnzPerSample = false);
-
-/**
- * @brief generate sequenceStartPositions for INPUT_SEQUENCE_DATA,
- *        INPUT_HASSUB_SEQUENCE_DATA and INPUT_SEQUENCE_LABEL
- *
- * @param batchSize                      batchSize
- *        sequenceStartPositions[out] generation output
- */
-void generateSequenceStartPositions(size_t batchSize,
-                                    IVectorPtr& sequenceStartPositions);
-
-void generateSequenceStartPositions(size_t batchSize,
-                                    ICpuGpuVectorPtr& sequenceStartPositions);
-
-/**
- * @brief generate subSequenceStartPositions for INPUT_HASSUB_SEQUENCE_DATA
- *        according to sequenceStartPositions
- *
- * @param sequenceStartPositions[in]     input
- *        subSequenceStartPositions[out] generation output
- */
-void generateSubSequenceStartPositions(const IVectorPtr& sequenceStartPositions,
-                                       IVectorPtr& subSequenceStartPositions);
-
-void generateSubSequenceStartPositions(
-    const ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions);
-
-/**
- * @brief generate cpuSequenceDims for INPUT_SEQUENCE_MDIM_DATA according to
- *        sequenceStartPositions
- *
- * @param sequenceStartPositions[in]     input
- *        cpuSequenceDims[out]              generation output
- */
-void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims);
-void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions,
-                              IVectorPtr& cpuSequenceDims);
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b);
-
-void checkVectorEqual(const IVectorPtr& a, const IVectorPtr& b);
-}  // namespace paddle
-- 
GitLab


From 5316c647766b19605a87e2eb98dba8ff6df2aadb Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 14:59:15 +0800
Subject: [PATCH 5/9] remove legacy cluster_train code

---
 paddle/scripts/cluster_train/conf.py          | 37 ---------
 paddle/scripts/cluster_train/paddle.py        | 82 -------------------
 paddle/scripts/cluster_train/run.sh           | 27 ------
 .../scripts/cluster_train_v2/fabric/conf.py   | 39 ---------
 .../fabric/docker_cluster/Dockerfile          | 11 ---
 .../fabric/docker_cluster/ssh_servers.yaml    | 23 ------
 paddle/scripts/cluster_train_v2/fabric/run.sh | 14 ----
 .../openmpi/docker_cluster/Dockerfile         | 43 ----------
 .../openmpi/docker_cluster/head.yaml          | 25 ------
 .../openmpi/docker_cluster/mpi-nodes.yaml     | 26 ------
 .../openmpi/docker_cluster/ssh/config         |  1 -
 .../openmpi/docker_cluster/ssh/id_rsa.mpi     | 27 ------
 .../openmpi/docker_cluster/ssh/id_rsa.mpi.pub |  1 -
 .../openmpi/start_mpi_train.sh                | 32 --------
 14 files changed, 388 deletions(-)
 delete mode 100644 paddle/scripts/cluster_train/conf.py
 delete mode 100644 paddle/scripts/cluster_train/paddle.py
 delete mode 100644 paddle/scripts/cluster_train/run.sh
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/conf.py
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/fabric/run.sh
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
 delete mode 100644 paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh

diff --git a/paddle/scripts/cluster_train/conf.py b/paddle/scripts/cluster_train/conf.py
deleted file mode 100644
index c77d7584d..000000000
--- a/paddle/scripts/cluster_train/conf.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-HOSTS = [
-    "root@192.168.100.17",
-    "root@192.168.100.18",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/home/paddle"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py
deleted file mode 100644
index ba313ac6a..000000000
--- a/paddle/scripts/cluster_train/paddle.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" module for launching cluster job """
-
-import os
-import argparse
-import socket
-import copy
-import time
-import signal
-
-from fabric.api import run, put, settings, env, prefix
-from fabric.tasks import execute
-
-#configuration for cluster
-import conf
-
-
-def refine_unknown_args(cmd_args):
-    '''
-    refine unknown parameters to handle some special parameters
-    '''
-    new_args = []
-    for arg in cmd_args:
-        if arg.startswith("--") and arg.find("=") != -1:
-            equal_pos = arg.find("=")  #find first = pos
-            arglist = list(arg)
-            arglist[equal_pos] = " "
-            arg = "".join(arglist)
-            arg = arg.lstrip("-")
-            new_args += arg.split(" ")
-        elif arg.startswith("--") and arg.find("=") == -1:
-            arg = arg.lstrip("-")
-            new_args.append(arg)
-        else:
-            new_args.append(arg)
-    return new_args
-
-
-def kill_process():
-    '''
-    kill comments threads
-    '''
-    run("ps aux \
-         | grep paddle_process_by_paddle \
-         | grep -v grep  \
-         | awk '{print $2}' \
-         | xargs kill > /dev/null 2>&1")
-
-
-def job_prepare(jobdir, data=None):
-    '''
-    prepare job related workspace data
-
-    Assuming you already installed PaddlePaddle in all nodes which means
-    PaddlePaddle related bins and dependencies libraries.
-    Assuming the train/test data have already been installed.
-    This function just prepare all related model and other resources
-    needed at runtime.
-    '''
-
-    def job_create_workspace(jobdir, data=None):
-        '''
-        prepare job workspace, common file, etc.
-        '''
-        log = os.path.join(jobdir, "log")
-        if data is not None:
-            #create job dir
-            run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
-            #push data and paddle bin
diff --git a/paddle/scripts/cluster_train/run.sh b/paddle/scripts/cluster_train/run.sh
deleted file mode 100644
index 331c64988..000000000
--- a/paddle/scripts/cluster_train/run.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/sh
-
-#python paddle.py \
-#  --job_workspace="${PATH_TO_REMOTE_EXISTED_WORKSPACE}" \
-#  --dot_period=10 \
-#  --ports_num_for_sparse=2 \
-#  --log_period=50 \
-#  --num_passes=10 \
-#  --trainer_count=4 \
-#  --saving_period=1 \
-#  --local=0 \
-#  --config=./trainer_config.py \
-#  --save_dir=./output \
-#  --use_gpu=0
-
-python paddle.py \
-  --job_dispatch_package="${PATH_TO_LOCAL_WORKSPACE}" \
-  --dot_period=10 \
-  --ports_num_for_sparse=2 \
-  --log_period=50 \
-  --num_passes=10 \
-  --trainer_count=4 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/fabric/conf.py b/paddle/scripts/cluster_train_v2/fabric/conf.py
deleted file mode 100644
index e96503d09..000000000
--- a/paddle/scripts/cluster_train_v2/fabric/conf.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-HOSTS = [
-    "root@10.1.9.7",
-    "root@10.1.18.7",
-    "root@10.1.32.9",
-]
-'''
-workspace configuration
-'''
-#root dir for workspace, can be set as any director with real user account
-ROOT_DIR = "/root"
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 1
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 1
-#trainer whether use gpu
-PADDLE_USE_GPU = "False"
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
deleted file mode 100644
index 6606c0126..000000000
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
+++ /dev/null
@@ -1,11 +0,0 @@
-FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
-RUN apt-get update && apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-
-RUN echo 'root:root' |chpasswd
-
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
deleted file mode 100644
index 0784b2d1b..000000000
--- a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: ssh-servers
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: ssh-servers
-    spec:
-      containers:
-      - name: ssh-servers
-        image: docker.paddlepaddlehub.com/paddlessh
-        resources:
-          limits:
-            cpu: 500m
-            memory: 1Gi
-          requests:
-            cpu: 500m
-            memory: 1Gi
-        ports:
-        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh
deleted file mode 100644
index f6324bcb1..000000000
--- a/paddle/scripts/cluster_train_v2/fabric/run.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-python paddle.py \
-  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
-  --dot_period=10 \
-  --ports_num_for_sparse=1 \
-  --log_period=50 \
-  --num_passes=5 \
-  --trainer_count=2 \
-  --saving_period=1 \
-  --local=0 \
-  --config=./trainer_config.py \
-  --save_dir=./output \
-  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
deleted file mode 100644
index c2f631bdf..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-# Build this image:  docker build -t mpi .
-#
-
-FROM paddlepaddle/paddle:0.10.0rc3
-
-ENV DEBIAN_FRONTEND noninteractive
-
-RUN apt-get update -y && \
-    apt-get upgrade -y && \
-    apt-get install -y openssh-server zip unzip vim sudo \
-gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
-pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
-mkdir /var/run/sshd && \
-echo 'root:tutorial' | chpasswd && \
-sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
-# SSH login fix. Otherwise user is kicked off after login
-sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
-echo "export VISIBLE=now" >> /etc/profile && \
-adduser --disabled-password --gecos "" tutorial && \
-echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
-mkdir /home/tutorial/.ssh/
-
-ENV HOME /home/tutorial
-ENV NOTVISIBLE "in users profile"
-
-# ------------------------------------------------------------
-# Set-Up SSH with our Github deploy key
-# ------------------------------------------------------------
-
-ADD ssh/config /home/tutorial/.ssh/config
-ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
-ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
-
-#---------------------------------------------------------------
-#LD_LIBRARY_PATH
-#---------------------------------------------------------------
-
-RUN export LD_LIBRARY_PATH=/usr/lib/openmpi/lib/
-
-WORKDIR /home/tutorial
-EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
deleted file mode 100644
index 34835e5eb..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-header
-  labels:
-    app: mpi-header
-spec:
-  replicas: 1
-  template:
-    metadata:
-      labels:
-        app: mpi-header
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-header
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
deleted file mode 100644
index 2fd5cb4d4..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: Deployment
-metadata:
-  name: mpi-nodes
-  labels:
-    app: mpi-nodes
-spec:
-  replicas: 3
-  template:
-    metadata:
-      labels:
-        app: mpi-nodes
-    spec:
-      containers:
-      - image: typhoon1986/paddle-openmpi
-        name : mpi-nodes
-        resources:
-          limits:
-            cpu: 500m
-            memory: 2Gi
-          requests:
-            cpu: 500m
-            memory: 2Gi
-        ports:
-        - containerPort: 22
-        imagePullPolicy: Always
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
deleted file mode 100644
index a9ecad07c..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
+++ /dev/null
@@ -1 +0,0 @@
-StrictHostKeyChecking no
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
deleted file mode 100644
index 23768343e..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
+++ /dev/null
@@ -1,27 +0,0 @@
------BEGIN RSA PRIVATE KEY-----
-MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
-1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
-O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
-36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
-mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
-bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
-OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
-TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
-79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
-YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
-mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
-lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
-rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
-DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
-44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
-fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
-cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
-g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
-yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
-PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
-v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
-hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
-sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
-zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
-yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
------END RSA PRIVATE KEY-----
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
deleted file mode 100644
index 015f2b42e..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
+++ /dev/null
@@ -1 +0,0 @@
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
deleted file mode 100644
index 2a7f46362..000000000
--- a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# General trainning configurations
-
-NICS=eth0
-PADDLE_INIT_PORT=7164
-PADDLE_INIT_PORTS_NUM=1
-PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
-PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
-PADDLE_INIT_USE_GPU=False
-
-PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
-PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
-PADDLE_CLUSTER_TRAIN=True
-
-env
-
-# start pserver
-stdbuf -oL nohup paddle pserver \
-  --port=$PADDLE_INIT_PORT \
-  --ports_num=$PADDLE_INIT_PORTS_NUM \
-  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-  --nics=$NICS \
-  --comment=paddle_cluster_pserver \
-  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS \
-  &> logs/pserver.log &
-
-# start trainer
-# NOTE: train.py will use the above environment variables as configuration
-python train.py &> logs/train.log
-
-# kill background pservers when train finishes
-ps -ef | grep pserver | awk '{print $2}' | xargs kill
-- 
GitLab


From 3ede8b67e6913e19c3db523f25ed5c95c061f321 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 15:20:00 +0800
Subject: [PATCH 6/9] update CMakeLists.txt

---
 CMakeLists.txt                                | 18 ---------
 paddle/CMakeLists.txt                         | 30 +-------------
 .../operators/positive_negative_pair_op.h     |  1 -
 .../sigmoid_cross_entropy_with_logits_op.h    |  1 -
 paddle/testing/CMakeLists.txt                 | 10 +----
 python/CMakeLists.txt                         | 40 +------------------
 python/setup.py.in                            | 32 ---------------
 7 files changed, 5 insertions(+), 127 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6aa8f1b8..a51552d96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -279,9 +279,6 @@ include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
-include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 
 set(EXTERNAL_LIBS
     gflags
@@ -320,21 +317,6 @@ if(USE_NNPACK)
     list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
-add_subdirectory(proto)
-
-if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
-    # "add_subdirectory(go)" should be placed after the following loine,
-    # because it depends on paddle/optimizer.
-    add_subdirectory(paddle/legacy/optimizer)
-endif()
-
-# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
-# placed after this block, because they depends on it.
-if(WITH_GOLANG)
-    enable_language(Go)
-    add_subdirectory(go)
-endif(WITH_GOLANG)
-
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 6b665a9ef..c0c04d475 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,30 +1,4 @@
-if(NOT WITH_FLUID_ONLY)
-  add_subdirectory(legacy/cuda)
-  add_subdirectory(legacy/function)
-  add_subdirectory(legacy/utils)
-  add_subdirectory(legacy/math)
-  add_subdirectory(legacy/gserver)
-  add_subdirectory(legacy/parameter)
-
-  if(MOBILE_INFERENCE)
-    add_subdirectory(legacy/capi)
-  else()
-    add_subdirectory(legacy/pserver)
-    add_subdirectory(legacy/trainer)
-    add_subdirectory(scripts)
-
-    if(WITH_C_API)
-      add_subdirectory(legacy/capi)
-    endif()
-
-    if(WITH_SWIG_PY)
-      add_subdirectory(legacy/api)
-    endif()
-  endif()
-endif()
-
+add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
-  add_subdirectory(fluid)
-endif()
+add_subdirectory(fluid)
diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
index db0a1002f..a47deb18b 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
index b8731c232..6e75f9e0b 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/hostdevice.h"
-#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 614596958..dc6245ce6 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,13 +1,5 @@
 # for paddle test case
 
 if(WITH_TESTING)
-  add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
-  if(NOT WIN32)
-    add_library(paddle_test_util STATIC TestUtil.cpp)
-    add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
-  endif(NOT WIN32)
-  if(NOT MOBILE_INFERENCE)
-    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
-  endif()
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
 endif()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 72c0d03e5..37ad77549 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,27 +4,6 @@ set(PY_FILES paddle/__init__.py
   ${UTILS_PY_FILES}
   ${FLUID_PY_FILES})
 
-if(NOT WITH_FLUID_ONLY)
-  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
-  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
-  set(PY_FILES ${PY_FILES}
-    ${TRAINER_PY_FILES}
-    ${HELPERS_PY_FILES}
-    ${V2_PY_FILES})
-
-  add_custom_target(copy_paddle_master)
-
-  SET(COPY_PADDLE_MASTER "")
-  if(WITH_GOLANG)
-    SET(COPY_PADDLE_MASTER "copy_paddle_master")
-    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-      )
-    add_dependencies(copy_paddle_master paddle_master)
-  endif(WITH_GOLANG)
-endif()
-
 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
 if(WITH_MKLML)
@@ -64,7 +43,7 @@ IF(WIN32)
             COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
             COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
             COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-            DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+            DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ELSE(WIN32)
 	add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND touch stub.cc
@@ -74,16 +53,10 @@ ELSE(WIN32)
 		COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
 		COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
 		COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-		DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+		DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 ENDIF()
 
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
-if(NOT WITH_FLUID_ONLY)
-    set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
-    if(WITH_SWIG_PY)
-        list(APPEND paddle_python_deps python_api_wheel)
-    endif()
-endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
@@ -91,15 +64,6 @@ set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 if (WITH_TESTING)
   add_subdirectory(paddle/reader/tests)
   add_subdirectory(paddle/dataset/tests)
-  if(NOT WITH_FLUID_ONLY)
-    add_subdirectory(paddle/trainer_config_helpers/tests)
-    if (WITH_SWIG_PY)
-      # enable v2 API unittest only when paddle swig api is compiled
-      add_subdirectory(paddle/v2/tests)
-      add_subdirectory(paddle/v2/plot/tests)
-      add_subdirectory(paddle/v2/reader/tests)
-    endif()
-  endif()
   add_subdirectory(paddle/fluid/tests)
   add_subdirectory(paddle/fluid/contrib/tests)
 endif()
diff --git a/python/setup.py.in b/python/setup.py.in
index c9afe6c88..730b2e1f7 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -117,17 +117,6 @@ packages=['paddle',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
 
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    packages+=['paddle.proto',
-               'paddle.trainer',
-               'paddle.trainer_config_helpers',
-               'paddle.v2',
-               'paddle.v2.master',
-               'paddle.v2.plot',
-               'paddle.v2.reader',
-               'paddle.v2.dataset',
-               'py_paddle']
-
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
 
@@ -136,19 +125,8 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    paddle_bin_dir = 'opt/paddle/bin'
-    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_trainer',
-                   '${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_merge_model',
-                   '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main',
-                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
-
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
 
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    package_data['paddle.v2.master']=['libpaddle_master' + ext_name]
-    package_data['py_paddle']=['*.py','_swig_paddle' + ext_name]
-
 package_dir={
     '': '${PADDLE_BINARY_DIR}/python',
     # The paddle.fluid.proto will be generated while compiling.
@@ -157,8 +135,6 @@ package_dir={
     'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
     'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
 }
-if '${WITH_FLUID_ONLY}'== 'OFF':
-    package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
 
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
@@ -226,14 +202,6 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name
         if os.system(command) != 0:
             raise Exception("patch core.%s failed, command: %s" % (ext_name, command))
-        if '${WITH_FLUID_ONLY}'== 'OFF':
-            # change rpath of _swig_paddle.xx.
-            if "@APPLE@" == "1":
-                command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
-            else:
-                command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle" + ext_name
-            if os.system(command) != 0:
-                raise Exception("patch _swig_paddle.%s failed, command: %s" % (ext_name, command))
 
 ext_modules = [Extension('_foo', ['stub.cc'])]
 if os.name == 'nt':
-- 
GitLab


From 193edfa746bab6d4fdf47a2c1944648cdde7d378 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 15:34:43 +0800
Subject: [PATCH 7/9] remove legacy build_android and build_ios

test=develop
---
 .travis.yml                         |   2 -
 paddle/scripts/paddle_build.sh      | 114 +---------------------------
 tools/manylinux1/Dockerfile.android |  55 --------------
 3 files changed, 1 insertion(+), 170 deletions(-)
 delete mode 100644 tools/manylinux1/Dockerfile.android

diff --git a/.travis.yml b/.travis.yml
index 8c2d9f143..87de895dd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,7 +4,6 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
-    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 services:
@@ -13,7 +12,6 @@ os:
   - linux
 env:
   - JOB=check_style
-  - JOB=build_android
 addons:
   ssh_known_hosts: 13.229.163.131
 before_install:
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 0fb29d4b3..f58e39268 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -26,8 +26,6 @@ function print_usage() {
 
     echo -e "\n${RED}Options${NONE}:
     ${BLUE}build${NONE}: run build for x86 platform
-    ${BLUE}build_android${NONE}: run build for android platform
-    ${BLUE}build_ios${NONE}: run build for ios platform
     ${BLUE}test${NONE}: run all unit tests
     ${BLUE}single_test${NONE}: run a single unit test
     ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
@@ -301,110 +299,6 @@ EOF
     make install -j 8
 }
 
-function build_android() {
-    if [ $ANDROID_ABI == "arm64-v8a" ]; then
-      ANDROID_ARCH=arm64
-      if [ $ANDROID_API -lt 21 ]; then
-        echo "Warning: arm64-v8a requires ANDROID_API >= 21."
-        ANDROID_API=21
-      fi
-    else # armeabi, armeabi-v7a
-      ANDROID_ARCH=arm
-    fi
-
-    ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
-
-    cat <<EOF
-    ============================================
-    Generating the standalone toolchain ...
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
-          --arch=$ANDROID_ARCH
-          --platform=android-$ANDROID_API
-          --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
-    ============================================
-EOF
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
-          --arch=$ANDROID_ARCH \
-          --platform=android-$ANDROID_API \
-          --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-
-    BUILD_ROOT=${PADDLE_ROOT}/build_android
-    DEST_ROOT=${PADDLE_ROOT}/install_android
-
-    mkdir -p $BUILD_ROOT
-    cd $BUILD_ROOT
-
-    if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_NEON=ON \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DUSE_EIGEN_FOR_BLAS=ON \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DUSE_EIGEN_FOR_BLAS=OFF \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    elif [ $ANDROID_ABI == "armeabi" ]; then
-      cmake -DCMAKE_SYSTEM_NAME=Android \
-            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-            -DANDROID_ABI=$ANDROID_ABI \
-            -DANDROID_ARM_MODE=ON \
-            -DHOST_C_COMPILER=/usr/bin/gcc \
-            -DHOST_CXX_COMPILER=/usr/bin/g++ \
-            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-            -DCMAKE_BUILD_TYPE=MinSizeRel \
-            -DWITH_C_API=ON \
-            -DWITH_SWIG_PY=OFF \
-            ..
-    else
-      echo "Invalid ANDROID_ABI: $ANDROID_ABI"
-    fi
-
-    cat <<EOF
-    ============================================
-    Building in $BUILD_ROOT ...
-    ============================================
-EOF
-    make -j `nproc`
-    make install -j `nproc`
-}
-
-function build_ios() {
-    # Create the build directory for CMake.
-    mkdir -p ${PADDLE_ROOT}/build
-    cd ${PADDLE_ROOT}/build
-
-    # Compile paddle binaries
-    cmake .. \
-          -DCMAKE_SYSTEM_NAME=iOS \
-          -DIOS_PLATFORM=OS \
-          -DCMAKE_OSX_ARCHITECTURES="arm64" \
-          -DWITH_C_API=ON \
-          -DUSE_EIGEN_FOR_BLAS=ON \
-          -DWITH_TESTING=OFF \
-          -DWITH_SWIG_PY=OFF \
-          -DCMAKE_BUILD_TYPE=Release
-
-    make -j 2
-}
-
 function run_test() {
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
@@ -639,7 +533,7 @@ EOF
     case $LIB_TYPE in
       full)
         # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first
-        make -j `nproc` gen_proto_py framework_py_proto copy_paddle_pybind paddle_python
+        make -j `nproc` framework_py_proto copy_paddle_pybind paddle_python
         ;;
       pybind)
         # Build paddle pybind library. Takes 49 minutes to build. Might timeout
@@ -876,12 +770,6 @@ function main() {
         build
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
-      build_android)
-        build_android
-        ;;
-      build_ios)
-        build_ios
-        ;;
       test)
         run_test
         ;;
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
deleted file mode 100644
index 7eb040902..000000000
--- a/tools/manylinux1/Dockerfile.android
+++ /dev/null
@@ -1,55 +0,0 @@
-FROM ubuntu:16.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-# ENV variables
-ARG ANDROID_ABI
-ARG ANDROID_API
-
-ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
-ENV ANDROID_API=${ANDROID_API:-21}
-
-ENV HOME=/root \
-    ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
-
-RUN apt-get update && \
-    apt-get install -y \
-    git python-dev python-pip python-numpy \
-    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
-    apt-get clean -y
-
-# Install Go and glide
-RUN wget -qO- go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
-    tar -xz -C /usr/local && \
-    mkdir /root/gopath && \
-    mkdir /root/gopath/bin && \
-    mkdir /root/gopath/src
-ENV GOROOT=/usr/local/go GOPATH=/root/gopath
-# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
-
-# git credential to skip password typing
-RUN git config --global credential.helper store
-
-# Fix locales to en_US.UTF-8
-RUN localedef -i en_US -f UTF-8 en_US.UTF-8
-
-RUN pip install --upgrade pip==9.0.3 && \
-    pip install -U 'protobuf==3.1.0' && \
-    pip install -U wheel sphinx && \
-    pip install pre-commit
-
-# Android NDK
-RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
-    mkdir -p /opt/android-ndk-tmp && \
-    cd /opt/android-ndk-tmp && \
-    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
-    unzip -q android-ndk-r14b-linux-x86_64.zip && \
-    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
-
-- 
GitLab


From bbd921c32210ba94904066bd4eec8669a0ca0f97 Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 16:12:37 +0800
Subject: [PATCH 8/9] recover glide for check_style

test=develop
---
 go/glide.lock | 233 ++++++++++++++++++++++++++++++++++++++++++++++++++
 go/glide.yaml |  33 +++++++
 2 files changed, 266 insertions(+)
 create mode 100644 go/glide.lock
 create mode 100644 go/glide.yaml

diff --git a/go/glide.lock b/go/glide.lock
new file mode 100644
index 000000000..d15fc934d
--- /dev/null
+++ b/go/glide.lock
@@ -0,0 +1,233 @@
+hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
+updated: 2017-10-30T03:46:19.137696069Z
+imports:
+- name: github.com/alecthomas/gometalinter
+  version: bae2f1293d092fd8167939d5108d1b025eaef9de
+- name: github.com/beorn7/perks
+  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
+  subpackages:
+  - quantile
+- name: github.com/boltdb/bolt
+  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
+- name: github.com/cockroachdb/cmux
+  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
+- name: github.com/coreos/etcd
+  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
+  subpackages:
+  - alarm
+  - auth
+  - auth/authpb
+  - client
+  - clientv3
+  - clientv3/concurrency
+  - compactor
+  - discovery
+  - embed
+  - error
+  - etcdserver
+  - etcdserver/api
+  - etcdserver/api/etcdhttp
+  - etcdserver/api/v2http
+  - etcdserver/api/v2http/httptypes
+  - etcdserver/api/v3client
+  - etcdserver/api/v3election
+  - etcdserver/api/v3election/v3electionpb
+  - etcdserver/api/v3election/v3electionpb/gw
+  - etcdserver/api/v3lock
+  - etcdserver/api/v3lock/v3lockpb
+  - etcdserver/api/v3lock/v3lockpb/gw
+  - etcdserver/api/v3rpc
+  - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/auth
+  - etcdserver/etcdserverpb
+  - etcdserver/etcdserverpb/gw
+  - etcdserver/membership
+  - etcdserver/stats
+  - lease
+  - lease/leasehttp
+  - lease/leasepb
+  - mvcc
+  - mvcc/backend
+  - mvcc/mvccpb
+  - pkg/adt
+  - pkg/contention
+  - pkg/cors
+  - pkg/cpuutil
+  - pkg/crc
+  - pkg/debugutil
+  - pkg/fileutil
+  - pkg/httputil
+  - pkg/idutil
+  - pkg/ioutil
+  - pkg/logutil
+  - pkg/monotime
+  - pkg/netutil
+  - pkg/pathutil
+  - pkg/pbutil
+  - pkg/runtime
+  - pkg/schedule
+  - pkg/srv
+  - pkg/tlsutil
+  - pkg/transport
+  - pkg/types
+  - pkg/wait
+  - proxy/grpcproxy/adapter
+  - raft
+  - raft/raftpb
+  - rafthttp
+  - snap
+  - snap/snappb
+  - store
+  - version
+  - wal
+  - wal/walpb
+- name: github.com/coreos/go-semver
+  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
+  subpackages:
+  - semver
+- name: github.com/coreos/go-systemd
+  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
+  subpackages:
+  - daemon
+  - journal
+  - util
+- name: github.com/coreos/pkg
+  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
+  subpackages:
+  - capnslog
+- name: github.com/dgrijalva/jwt-go
+  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
+- name: github.com/ghodss/yaml
+  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/go-stack/stack
+  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
+- name: github.com/gogo/protobuf
+  version: 909568be09de550ed094403c2bf8a261b5bb730a
+  subpackages:
+  - proto
+- name: github.com/golang/protobuf
+  version: 4bd1920723d7b7c925de087aa32e2187708897f7
+  subpackages:
+  - jsonpb
+  - proto
+- name: github.com/golang/snappy
+  version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/google/btree
+  version: 925471ac9e2131377a91e1595defec898166fe49
+- name: github.com/grpc-ecosystem/go-grpc-prometheus
+  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
+- name: github.com/grpc-ecosystem/grpc-gateway
+  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
+  subpackages:
+  - runtime
+  - runtime/internal
+  - utilities
+- name: github.com/inconshreveable/log15
+  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
+- name: github.com/jonboulle/clockwork
+  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/mattn/go-colorable
+  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
+- name: github.com/mattn/go-isatty
+  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
+- name: github.com/matttproud/golang_protobuf_extensions
+  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
+  subpackages:
+  - pbutil
+- name: github.com/namsral/flag
+  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
+- name: github.com/PaddlePaddle/recordio
+  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
+- name: github.com/prometheus/client_golang
+  version: c5b7fccd204277076155f10851dad72b76a49317
+  subpackages:
+  - prometheus
+- name: github.com/prometheus/client_model
+  version: 6f3806018612930941127f2a7c6c453ba2c527d2
+  subpackages:
+  - go
+- name: github.com/prometheus/common
+  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
+  subpackages:
+  - expfmt
+  - internal/bitbucket.org/ww/goautoneg
+  - model
+- name: github.com/prometheus/procfs
+  version: a1dba9ce8baed984a2495b658c82687f8157b98f
+  subpackages:
+  - xfs
+- name: github.com/satori/go.uuid
+  version: 879c5887cd475cd7864858769793b2ceb0d44feb
+- name: github.com/sirupsen/logrus
+  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: github.com/ugorji/go
+  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
+  subpackages:
+  - codec
+- name: github.com/xiang90/probing
+  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
+- name: golang.org/x/crypto
+  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+  subpackages:
+  - bcrypt
+  - blowfish
+  - ssh/terminal
+- name: golang.org/x/net
+  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
+  subpackages:
+  - context
+  - http2
+  - http2/hpack
+  - idna
+  - internal/timeseries
+  - lex/httplex
+  - trace
+- name: golang.org/x/sys
+  version: e48874b42435b4347fc52bdee0424a52abc974d7
+  repo: https://github.com/golang/sys.git
+  vcs: git
+  subpackages:
+  - unix
+  - windows
+- name: golang.org/x/text
+  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
+  repo: https://github.com/golang/text.git
+  vcs: git
+  subpackages:
+  - secure/bidirule
+  - transform
+  - unicode/bidi
+  - unicode/norm
+- name: google.golang.org/grpc
+  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
+  subpackages:
+  - codes
+  - credentials
+  - grpclog
+  - internal
+  - keepalive
+  - metadata
+  - naming
+  - peer
+  - stats
+  - tap
+  - transport
+- name: gopkg.in/yaml.v2
+  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
+testImports:
+- name: github.com/davecgh/go-spew
+  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
+  subpackages:
+  - spew
+- name: github.com/pmezard/go-difflib
+  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
+  subpackages:
+  - difflib
+- name: github.com/stretchr/testify
+  version: 05e8a0eda380579888eb53c394909df027f06991
+  subpackages:
+  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
new file mode 100644
index 000000000..c5d66694a
--- /dev/null
+++ b/go/glide.yaml
@@ -0,0 +1,33 @@
+package: github.com/PaddlePaddle/Paddle/go
+import:
+- package: github.com/PaddlePaddle/recordio
+- package: github.com/coreos/etcd
+  version: ^3.2.1
+  subpackages:
+  - clientv3
+  - clientv3/concurrency
+  - embed
+  - etcdserver
+- package: github.com/namsral/flag
+  version: ^1.7.4-pre
+- package: github.com/sirupsen/logrus
+  version: ^1.0.0
+- package: github.com/topicai/candy
+- package: golang.org/x/crypto
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+- package: golang.org/x/sys
+  repo: https://github.com/golang/sys.git
+  vcs: git
+- package: golang.org/x/text
+  repo: https://github.com/golang/text.git
+  vcs: git
+- package: github.com/satori/go.uuid
+  version: v1.1.0
+- package: github.com/alecthomas/gometalinter
+  version: v1.2.1
+- package: github.com/inconshreveable/log15
+  version: v2.13
+- package: github.com/go-stack/stack
+  version: v1.6.0
+- package: github.com/golang/protobuf
-- 
GitLab


From c102f427d2eea147e75c69213c2e4253feb9071c Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 18 Jan 2019 22:02:04 +0800
Subject: [PATCH 9/9] make 'paddle version' valid

test=develop
---
 paddle/scripts/submit_local.sh.in | 36 +------------------------------
 python/setup.py.in                |  2 ++
 2 files changed, 3 insertions(+), 35 deletions(-)

diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 622a2d510..1f421f248 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -1,19 +1,5 @@
 #!/bin/bash
 
-function usage(){
-        echo "usage: paddle [--help] [<args>]"
-        echo "These are common paddle commands used in various situations:"
-        echo "    train             Start a paddle_trainer"
-        echo "    merge_model       Start a paddle_merge_model"
-        echo "    pserver           Start a paddle_pserver_main"
-        echo "    version           Print paddle version"
-        echo "    dump_config       Dump the trainer config as proto string"
-        echo "    make_diagram      Make Diagram using Graphviz"
-        echo ""
-        echo "'paddle train --help' 'paddle merge_model --help', 'paddle pserver --help', list more detailed usage of each command"
-}
-
-
 function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
@@ -177,30 +163,10 @@ cpu_config
 # echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
-    "train")
-        threads_config $@
-        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
-        ;;
-    "merge_model")
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2}
-        ;;
-    "pserver")
-        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2}
-        ;;
-    "dump_config")
-        python -m paddle.utils.dump_config ${@:2}
-        ;;
-    "make_diagram")
-        python -m paddle.utils.make_model_diagram ${@:2}
-        ;;
     "version")
         version
         ;;
-    "--help")
-        usage
-        ;;
     *)
-        usage
+        version
         ;;
  esac
diff --git a/python/setup.py.in b/python/setup.py.in
index 730b2e1f7..e00c88b3a 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -125,6 +125,8 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
 
 # the prefix is sys.prefix which should always be usr
 paddle_bins = ''
+if not '${WIN32}':
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
 package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]}
 
 package_dir={
-- 
GitLab